Ejemplo n.º 1
0
def run_benchmark_tskit(args):
    """
    Time loading, tree iteration and variant decoding for one tree sequence.

    The tree sequence is read from ``args.input``. At most
    ``args.num_variants`` variants are decoded. Results are printed to
    stdout.
    """
    started = time.perf_counter()
    ts = msprime.load(args.input)
    elapsed = time.perf_counter() - started
    print("Loaded in {:.2f}s".format(elapsed))

    print("num_nodes = ", ts.num_nodes)
    print("num_edges = ", ts.num_edges)
    print("num_trees = ", ts.num_trees)
    file_size = os.path.getsize(args.input)
    print("size = ", humanize.naturalsize(file_size, binary=True))

    started = time.perf_counter()
    trees_seen = sum(1 for _ in ts.trees(sample_counts=False))
    assert trees_seen == ts.num_trees
    elapsed = time.perf_counter() - started
    print("Iterated over trees in {:.2f}s".format(elapsed))

    started = time.perf_counter()
    num_variants = 0
    # The full sample list is passed explicitly: as of msprime 0.6.1 this
    # selects the tree-traversal based decoding, which was noted to be a
    # little more efficient than the sample-lists approach for UKBB trees.
    for _ in ts.variants(samples=ts.samples()):
        if num_variants == args.num_variants:
            break
        num_variants += 1
    elapsed = time.perf_counter() - started
    # Throughput is reported in millions of genotypes per second.
    mega_genotypes = (ts.num_samples * num_variants) / 10**6
    print("Iterated over {} variants in {:.2f}s @ {:.2f} M genotypes/s".format(
        num_variants, elapsed, mega_genotypes / elapsed))
Ejemplo n.º 2
0
def run_snip_centromere(args):
    """
    Snip the topology out of the centromere region of a tree sequence.

    The nominal centromere coordinates for ``args.chrom`` are read from the
    CSV file ``args.centromeres`` (columns ``chrom``, ``start``, ``end``).
    The region actually snipped is the largest gap between consecutive
    sites around those coordinates. The result is written to
    ``args.output``.

    :raises ValueError: if the CSV has no row for ``args.chrom``.
    """
    with open(args.centromeres) as csvfile:
        matching_row = next(
            (row for row in csv.DictReader(csvfile)
             if row["chrom"] == args.chrom),
            None)
        if matching_row is None:
            raise ValueError("Did not find row")
        start = int(matching_row["start"])
        end = int(matching_row["end"])

    ts = msprime.load(args.input)
    position = ts.tables.sites.position
    first = np.searchsorted(position, start)
    last = np.searchsorted(position, end)
    # There may be a bunch of sites inside the nominal centromere, so the
    # largest gap between neighbouring sites in this window is used as the
    # real start and end. The input coordinates still matter (e.g. for
    # UKBB), because the largest gap over the whole chromosome is not
    # necessarily in the centromere.
    window = position[first:last + 1]
    widest = np.argmax(np.diff(window))
    real_start = window[widest] + 1
    real_end = window[widest + 1]
    print("Centromere at", start, end, "Snipping topology from ", real_start,
          real_end)
    tsinfer.snip_centromere(ts, real_start, real_end).dump(args.output)
Ejemplo n.º 3
0
def run_trees(args):
    """
    Cross-check the local tree algorithms against msprime's own trees.

    Loads ``args.history_file``, prints the parent arrays from
    ``generate_trees`` and the leaf counts from ``count_leaves``, then
    asserts that both match what ``tree_sequence.trees()`` reports.
    """
    tree_sequence = msprime.load(args.history_file)
    N = tree_sequence.get_num_nodes()
    records = list(tree_sequence.records())
    # Split the first five fields of every record into parallel columns.
    left, right, node, children, times = (
        [record[k] for record in records] for k in range(5))

    print("Trees:")
    local_trees = []
    for parents in generate_trees(left, right, node, children, times):
        local_trees.append(list(parents))
        print("\t", parents)

    samples = set(range(1, tree_sequence.get_sample_size() + 1))
    print("Counts:")
    local_counts = []
    for _, leaf_counts in count_leaves(
            left, right, node, children, times, samples):
        local_counts.append(list(leaf_counts))
        print("\t", leaf_counts)

    msp_trees = []
    msp_counts = []
    node_ids = range(N + 1)
    for tree in tree_sequence.trees():
        msp_trees.append([tree.get_parent(j) for j in node_ids])
        msp_counts.append([tree.get_num_leaves(j) for j in node_ids])

    assert msp_trees == local_trees
    assert msp_counts == local_counts
Ejemplo n.º 4
0
def ts_to_bcf_single(ts_file, out_file, runner):
    """
    Convert the tree sequence stored in ``ts_file`` to BCF at ``out_file``.

    If the tree sequence has no individuals, the conversion is delegated
    to a ``tskit vcf | bcftools view`` shell pipeline run through
    ``runner.run``. Otherwise the tree sequence is passed through
    ``ts_clean_inds`` and its VCF output is streamed into a ``bcftools``
    subprocess over a pipe.

    :param ts_file: path of the input tree sequence file.
    :param out_file: path of the BCF file to write.
    :param runner: object whose ``run(cmd)`` executes a shell command.
    :raises RuntimeError: if ``bcftools`` exits with a non-zero status.
    """
    ts = msprime.load(ts_file)
    if ts.num_individuals == 0:
        bcf_cmd = "tskit vcf --ploidy 2 {} | bcftools view -O b > {}".format(
            ts_file, out_file)
        runner.run(bcf_cmd)
        return

    # Need to remove non-sample individuals from ts or else tskit gets
    # confused.
    ts_only_sample_inds = ts_clean_inds(ts)
    # TODO: This will probably fail if metadata isn't present
    ids = [
        ind.metadata.decode('utf8')
        for ind in ts_only_sample_inds.individuals()
    ]

    read_fd, write_fd = os.pipe()
    write_pipe = os.fdopen(write_fd, "w")
    proc = None
    try:
        # BCF is a binary format, so the output is opened in binary mode.
        with open(out_file, "wb") as f:
            proc = subprocess.Popen(["bcftools", "view", "-O", "b"],
                                    stdin=read_fd,
                                    stdout=f)
        # The child holds its own copy of the read end. Closing ours means
        # that a write fails with EPIPE if bcftools exits early, instead of
        # blocking forever once the pipe buffer is full.
        os.close(read_fd)
        read_fd = None
        ts_only_sample_inds.write_vcf(write_pipe, individual_names=ids)
        # Closing the write end signals EOF to bcftools.
        write_pipe.close()
    finally:
        if read_fd is not None:
            os.close(read_fd)
        if not write_pipe.closed:
            try:
                write_pipe.close()
            except OSError:
                # We are already unwinding from an earlier error; a failed
                # flush here must not mask it.
                pass
        if proc is not None:
            proc.wait()
    if proc.returncode != 0:
        raise RuntimeError(
            "bcftools failed with status: {}".format(proc.returncode))
Ejemplo n.º 5
0
def run_simplify_subsample_size_benchmark(args):
    """
    Benchmark ``simplify`` against the number of samples retained.

    Loads the tree sequence in ``args.file``. For each of 20
    log-spaced subsample sizes between 10 and 10**5 it times ``simplify``
    on 5 random subsamples and records the mean process time. The
    results are written to ``data/simplify_subsample.dat`` and plotted to
    ``simplify_subsample_perf.pdf``.
    """
    ts = msprime.load(args.file)
    # Fixed seed so that the subsamples drawn are reproducible.
    np.random.seed(1)
    print("Running simplify benchmarks")
    N = 20
    num_replicates = 5
    subsample_size = np.logspace(1, 5, N).astype(int)
    print(subsample_size)
    T = np.zeros(N)
    for j, size in enumerate(subsample_size):
        timings = np.zeros(num_replicates)
        for k in range(num_replicates):
            sample = np.random.choice(
                ts.num_samples, size, replace=False).astype(np.int32)
            started = time.process_time()
            ts.simplify(sample)
            timings[k] = time.process_time() - started
        T[j] = np.mean(timings)
        print(size, T[j])

    results = pd.DataFrame({"subsample_size": subsample_size, "time": T})
    results.to_csv("data/simplify_subsample.dat")

    plt.semilogx(subsample_size, T, marker="o")
    plt.xlabel("Subsample size")
    plt.ylabel("Time to simplify (s)")
    plt.savefig("simplify_subsample_perf.pdf", format='pdf')
Ejemplo n.º 6
0
def run_trees(args):
    """
    Cross-check the local tree algorithms against msprime's own trees.

    Loads ``args.history_file``, prints the parent arrays from
    ``generate_trees`` and the leaf counts from ``count_leaves``, then
    asserts that both match what ``tree_sequence.trees()`` reports.
    """
    tree_sequence = msprime.load(args.history_file)
    N = tree_sequence.get_num_nodes()
    records = list(tree_sequence.records())
    # Split the first five fields of every record into parallel columns.
    left, right, node, children, times = (
        [record[k] for record in records] for k in range(5))

    print("Trees:")
    local_trees = []
    for parents in generate_trees(left, right, node, children, times):
        local_trees.append(list(parents))
        print("\t", parents)

    samples = set(range(1, tree_sequence.get_sample_size() + 1))
    print("Counts:")
    local_counts = []
    for _, leaf_counts in count_leaves(
            left, right, node, children, times, samples):
        local_counts.append(list(leaf_counts))
        print("\t", leaf_counts)

    msp_trees = []
    msp_counts = []
    node_ids = range(N + 1)
    for tree in tree_sequence.trees():
        msp_trees.append([tree.get_parent(j) for j in node_ids])
        msp_counts.append([tree.get_num_leaves(j) for j in node_ids])

    assert msp_trees == local_trees
    assert msp_counts == local_counts
def run_benchmark():
    """
    Run the benchmark suite on the ``10**7`` sample tree sequence.

    Loads ``<data_prefix>/10000000.trees`` and reports load time, tree
    iteration time and the time to compute per-site allele frequencies.
    It also runs the newick and BCF benchmarks.
    """
    print("msprime version:", msprime.__version__)

    started = time.perf_counter()
    filename = os.path.join(data_prefix, "{}.trees".format(10**7))
    ts = msprime.load(filename)
    elapsed = time.perf_counter() - started
    file_size = humanize.naturalsize(os.path.getsize(filename), binary=True)
    print("loaded {} tree sequence in {:.2f}s".format(file_size, elapsed))

    run_benchmark_newick(ts, 2)

    matrix_size = ts.num_samples * ts.num_sites
    print("Total size of genotype matrix = ",
          humanize.naturalsize(matrix_size, binary=True))

    started = time.perf_counter()
    trees_seen = sum(
        1 for _ in ts.trees(sample_counts=False, sample_lists=False))
    assert trees_seen == ts.num_trees
    elapsed = time.perf_counter() - started
    print("Iterated over {} trees in {:.2f}s".format(ts.num_trees, elapsed))

    started = time.perf_counter()
    samples = np.arange(10**6)
    freq = np.zeros(ts.num_sites)
    for tree in ts.trees(tracked_samples=samples):
        for site in tree.sites():
            # Only the first mutation at each site is considered.
            first_mutation = site.mutations[0]
            freq[site.id] = tree.num_tracked_samples(first_mutation.node)
    elapsed = time.perf_counter() - started
    print("Computed {} allele frequencies in {:.2f}s".format(
        ts.num_sites, elapsed))

    benchmark_bcf(ts)
Ejemplo n.º 8
0
def main(args):
    """
    Run the full PRS simulation pipeline.

    Either simulates an out-of-Africa tree sequence (dumping it to
    ``<args.out>_nhaps_<...>.hdf5``) or loads one from ``args.tree``.
    It then computes true PRS, assigns case/control status, runs the
    GWAS, clumps variants, infers PRS and writes the summaries.

    :param args: parsed command-line arguments. ``args.nhaps`` is a
        comma-separated string of haplotype counts.
    """
    # Materialise the haplotype counts as a list: on Python 3 ``map``
    # returns a one-shot iterator, and ``nhaps`` is consumed many times
    # below, so every use after the first would see it empty.
    nhaps = list(map(int, args.nhaps.split(',')))
    recomb = args.recomb_map

    # generate/load coalescent simulations
    if args.tree is None:
        (pop_config, mig_mat, demog) = out_of_africa(nhaps)
        simulation = simulate_ooa(pop_config, mig_mat, demog, recomb)
        simulation.dump(args.out+ '_nhaps_' + '_'.join(map(str, nhaps)) + '.hdf5', True)
    else:
        simulation = msprime.load(args.tree)

    eprint(simulation)
    eprint('Number of haplotypes: ' + ','.join(map(str, nhaps)))
    eprint('Number of trees: ' + str(simulation.get_num_trees()))
    eprint('Number of mutations: ' + str(simulation.get_num_mutations()))
    eprint('Sequence length: ' + str(simulation.get_sequence_length()))

    prs_true = true_prs(simulation, args.ncausal, args.h2, nhaps, args.out)
    cases_diploid, controls_diploid, prs_norm, environment = case_control(
        prs_true, args.h2, nhaps, args.prevalence, args.ncontrols, args.out)
    summary_stats, cases_haploid, controls_haploid = run_gwas(
        simulation, cases_diploid, controls_diploid, args.p_threshold,
        args.cc_maf)
    clumped_snps, usable_positions = clump_variants(
        simulation, summary_stats, nhaps, args.r2, args.window_size)
    prs_infer = infer_prs(
        simulation, nhaps, clumped_snps, summary_stats, usable_positions,
        args.h2, args.ncausal, args.out)
    write_summaries(
        args.out, prs_true, prs_infer, nhaps, cases_diploid,
        controls_diploid, args.h2, args.ncausal, environment)
Ejemplo n.º 9
0
def newick_example():
    """
    Write the trees of ``example.hdf5`` to ``example.newick``.

    Each output line is ``[<length>]<newick string>``, using the pairs
    yielded by ``newick_trees(8)``.
    """
    tree_sequence = msprime.load("example.hdf5")
    with open("example.newick", "w") as out:
        for length, newick in tree_sequence.newick_trees(8):
            print("[{0}]{1}".format(length, newick), file=out)
Ejemplo n.º 10
0
 def verify_round_trip(self, ts, version):
     """
     Check that ``ts`` survives a legacy dump/load cycle for ``version``,
     and that the result survives a dump/load cycle in the current format.
     """
     msprime.dump_legacy(ts, self.temp_file, version=version)
     # stderr is silenced while the legacy file is loaded.
     with silence_stderr():
         from_legacy = msprime.load_legacy(self.temp_file)
     self.verify_tree_sequences_equal(ts, from_legacy)
     from_legacy.dump(self.temp_file)
     reloaded = msprime.load(self.temp_file)
     self.verify_tree_sequences_equal(ts, reloaded)
Ejemplo n.º 11
0
 def test_simulate_short_args(self):
     """
     Run ``simulate`` with the short option forms (-m, -r, -u) and check
     the resulting history file.
     """
     argv = [
         "simulate", "100", self._history_file,
         "-m", "1e2", "-r", "5", "-u", "2"]
     _stdout, _stderr = capture_output(cli.msp_main, argv)
     tree_sequence = msprime.load(self._history_file)
     self.assertEqual(tree_sequence.get_sample_size(), 100)
     self.assertEqual(tree_sequence.get_num_loci(), 100)
     self.assertGreater(tree_sequence.get_num_mutations(), 0)
Ejemplo n.º 12
0
 def verify_round_trip(self, ts, version):
     """
     Check that ``ts`` survives a legacy dump/load cycle for ``version``,
     and that the result survives a dump/load cycle in the current format.
     """
     msprime.dump_legacy(ts, self.temp_file, version=version)
     from_legacy = msprime.load_legacy(self.temp_file)
     # Comparisons are made with simplify=True for versions below 10.
     needs_simplify = version < 10
     self.verify_tree_sequences_equal(
         ts, from_legacy, simplify=needs_simplify)
     from_legacy.dump(self.temp_file)
     reloaded = msprime.load(self.temp_file)
     self.verify_tree_sequences_equal(ts, reloaded, simplify=needs_simplify)
Ejemplo n.º 13
0
def run_dump_provenances(args):
    """
    Print the provenance records of the tree sequence in
    ``args.history_file``.

    With ``args.human`` set, each record is pretty-printed as indented
    JSON. Otherwise the records are written to stdout by ``dump_text``.
    """
    tree_sequence = msprime.load(args.history_file)
    if not args.human:
        tree_sequence.dump_text(provenances=sys.stdout)
        return
    for provenance in tree_sequence.provenances():
        record = json.loads(provenance.record)
        print("id={}, timestamp={}, record={}".format(
            provenance.id, provenance.timestamp,
            json.dumps(record, indent=4)))
Ejemplo n.º 14
0
def convert_haplotypes():
    """
    Load the tree sequence named by the first command-line argument,
    generate mutations on it and count the haplotypes it yields.
    """
    import sys
    ts = msprime.load(sys.argv[1])
    ts.generate_mutations(0.0001, 1)
    print("Generated mutations", ts.get_num_mutations())
    # Each haplotype is generated and discarded; only the count is kept.
    num_haplotypes = sum(1 for _ in ts.haplotypes())
    print("generated ", num_haplotypes, "haplotypes")
Ejemplo n.º 15
0
 def verify_output(self, output_path):
     """
     Check that the tree sequence at ``output_path`` agrees with
     ``self.input_ts`` on sample count, sequence length, site count and
     genotype matrix, and that it has more than one site.
     """
     actual = msprime.load(output_path)
     expected = self.input_ts
     self.assertEqual(actual.num_samples, expected.num_samples)
     self.assertEqual(actual.sequence_length, expected.sequence_length)
     self.assertEqual(actual.num_sites, expected.num_sites)
     self.assertGreater(actual.num_sites, 1)
     genotypes_match = np.array_equal(
         actual.genotype_matrix(), expected.genotype_matrix())
     self.assertTrue(genotypes_match)
Ejemplo n.º 16
0
 def test_optional_provenance(self):
     """
     A file whose "provenance" entry has been deleted must still load,
     and must report an empty provenance list.
     """
     original = single_locus_no_mutation_example()
     with tempfile.NamedTemporaryFile() as tmp:
         path = tmp.name
         original.dump(path)
         # Remove the provenance entry directly from the HDF5 file.
         h5 = h5py.File(path, "r+")
         del h5["provenance"]
         h5.close()
         del h5
         reloaded = msprime.load(path)
         self.assertEqual(reloaded.get_provenance(), [])
Ejemplo n.º 17
0
 def test_optional_provenance(self):
     """
     A file whose "provenance" entry has been deleted must still load,
     and must report an empty provenance list.
     """
     original = single_locus_no_mutation_example()
     with tempfile.NamedTemporaryFile() as tmp:
         path = tmp.name
         original.dump(path)
         # Remove the provenance entry directly from the HDF5 file.
         h5 = h5py.File(path, "r+")
         del h5["provenance"]
         h5.close()
         del h5
         reloaded = msprime.load(path)
         self.assertEqual(reloaded.get_provenance(), [])
Ejemplo n.º 18
0
def run_newick(args):
    """
    Estimate the newick storage needed for the tree sequence in
    ``args.file``.

    The newick string of the first tree is measured, and that length is
    multiplied by the number of trees to estimate the total.
    """
    ts = msprime.load(args.file)
    first_tree = next(ts.trees())
    newick_length = len(first_tree.newick())
    mebibyte = 2**20
    tebibyte = 2**40
    estimated_total = newick_length * ts.num_trees
    print("newick size 1 tree    = {:.2f} MiB".format(
        newick_length / mebibyte))
    print("newick size all trees = {:.2f} TiB".format(
        estimated_total / tebibyte))
Ejemplo n.º 19
0
def dump_example():
    """
    Simulate a tree sequence, dump it to ``example.hdf5``, load it back
    and assert that the haplotypes are unchanged.
    """
    path = "example.hdf5"
    simulated = msprime.simulate(
        sample_size=10,
        num_loci=1000,
        scaled_recombination_rate=0.1,
        scaled_mutation_rate=0.01,
        random_seed=1)
    expected = list(simulated.haplotypes())
    simulated.dump(path)
    # Load a second, independent instance from the file just written.
    reloaded = msprime.load(path)
    assert expected == list(reloaded.haplotypes())
Ejemplo n.º 20
0
 def test_optional_provenance(self):
     """
     A file whose "provenance" entry has been deleted must still load,
     report an empty provenance list and pass the dump-format checks.
     """
     original = single_locus_no_mutation_example()
     path = self.temp_file
     original.dump(path)
     # Remove the provenance entry directly from the HDF5 file.
     h5 = h5py.File(path, "r+")
     del h5["provenance"]
     h5.close()
     del h5
     reloaded = msprime.load(path)
     self.assertEqual(reloaded.get_provenance(), [])
     self.verify_tree_dump_format(reloaded)
Ejemplo n.º 21
0
    def load(cls, path):
        '''
        Build a :class:`SlimTreeSequence` from a .trees file on disk.

        :param string path: The path to a .trees file.
        :rtype: SlimTreeSequence
        '''
        # The file is first read with msprime, and the resulting tree
        # sequence is handed to the class constructor.
        return cls(msprime.load(path))
def write_treeseq(chrom):
    """
    Add mutations to the tree sequence for ``chrom`` and save the result.

    Input path, mutation rate, seed and output path are looked up by
    ``chrom`` in ``args.tree_file``, ``args.mut_rate``, ``seeds`` and
    ``args.outfile``. Existing mutations are kept (``keep=True``).

    :return: ``True`` once the mutated tree sequence has been dumped.
    """
    treefile = args.tree_file[chrom]
    outfile = args.outfile[chrom]
    mut_rate = args.mut_rate[chrom]
    seed = seeds[chrom]
    logfile.write("Simulating mutations on " + treefile + "\n")
    logfile.flush()
    ts = msprime.load(treefile)
    mutated_ts = msprime.mutate(ts, rate=mut_rate, random_seed=seed, keep=True)
    # Separator space added: the message used to run into the path.
    logfile.write("Saving to " + outfile + "\n")
    mutated_ts.dump(outfile)
    return True
Ejemplo n.º 23
0
 def verify_dump_load(self, tree_sequence):
     """
     Dump the tree sequence to a temporary file, load it back, and check
     that the records of the two instances are equal.
     """
     with tempfile.NamedTemporaryFile() as tmp:
         tree_sequence.dump(tmp.name)
         reloaded = msprime.load(tmp.name)
     self.assertEqual(
         list(tree_sequence.records()), list(reloaded.records()))
Ejemplo n.º 24
0
def write_vcf(chrom):
    """
    Add mutations to the tree sequence for ``chrom`` and write it as VCF.

    Input path, mutation rate, seed and output path are looked up by
    ``chrom`` in ``args.tree_file``, ``args.mut_rate``, ``seeds`` and
    ``args.vcffile``. The VCF is written with ``ploidy=1``.

    :return: ``True`` once the VCF has been written.
    """
    treefile = args.tree_file[chrom]
    vcf_path = args.vcffile[chrom]
    mut_rate = args.mut_rate[chrom]
    seed = seeds[chrom]
    logfile.write("Simulating mutations on " + treefile + "\n")
    ts = msprime.load(treefile)
    mutated_ts = msprime.mutate(ts, rate=mut_rate, random_seed=seed)
    logfile.write("Saving to " + vcf_path + "\n")
    # The output is opened only once there is something to write, and the
    # context manager guarantees it is flushed and closed. The handle was
    # previously never closed.
    with open(vcf_path, "w") as vcf:
        mutated_ts.write_vcf(vcf, ploidy=1)
    return True
Ejemplo n.º 25
0
def gwas_example():
    """
    Write PLINK association input for the tree sequence stored in
    ``tmp__NOBACKUP__/gqt.hdf5``, with half of the samples as cases.
    """
    # Alternative input, kept for reference:
    # n = 100
    # ts = msprime.simulate(
    #     n, 1000, scaled_recombination_rate=0.1, scaled_mutation_rate=10,
    #     random_seed=1)
    ts = msprime.load("tmp__NOBACKUP__/gqt.hdf5")
    sample_size = ts.get_sample_size()
    num_cases = sample_size // 2
    # Alternative outputs, kept for reference:
    # write_ped(ts, num_cases, "tmp__NOBACKUP__/test")
    # write_ped(ts, num_cases, "tmp__NOBACKUP__/plink/gqt")
    write_plink_assoc(ts, num_cases)
Ejemplo n.º 26
0
def tw_find_segment(chunk):
    """
    Run ``find_segment`` on every entry of ``chunk['List']``.

    The data file ``chunk['Data']`` is first copied to a uniquely named
    temporary ``.hdf5`` file in the current directory. The tree sequence
    is loaded from that private copy, and the copy is removed afterwards.

    :param chunk: mapping with keys ``'Data'`` (path of the tree sequence
        file) and ``'List'`` (iterable of items passed to
        ``find_segment``).
    :return: list of ``find_segment`` results, in input order.
    """
    alphabet = string.ascii_uppercase + string.ascii_lowercase + string.digits
    rng = random.SystemRandom()
    tmpf = "tmp." + ''.join(rng.choice(alphabet) for _ in range(32)) + ".hdf5"
    copyfile(chunk['Data'], tmpf)
    try:
        # Load from the private copy. The copy used to be made and then
        # ignored in favour of the original file.
        data = msp.load(tmpf)
        return [find_segment(line, data) for line in chunk['List']]
    finally:
        # Always clean up, even if loading or searching fails.
        os.remove(tmpf)
Ejemplo n.º 27
0
    def test_run_defaults(self):
        """
        Run ``simulate`` with only the required arguments. Nothing may be
        printed, and the output must have one locus and no mutations.
        """
        sample_size = 10
        argv = ["simulate", str(sample_size), self._history_file]
        stdout, stderr = capture_output(cli.msp_main, argv)
        for captured in (stderr, stdout):
            self.assertEqual(len(captured), 0)

        tree_sequence = msprime.load(self._history_file)
        self.assertEqual(tree_sequence.get_sample_size(), sample_size)
        self.assertEqual(tree_sequence.get_num_loci(), 1)
        self.assertEqual(tree_sequence.get_num_mutations(), 0)
Ejemplo n.º 28
0
def run_dump_macs(args):
    """
    Print the tree sequence in ``args.history_file`` in macs format, so
    that it can be imported into pbwt.
    """
    tree_sequence = msprime.load(args.history_file)
    sample_size = tree_sequence.get_sample_size()
    sequence_length = tree_sequence.get_sequence_length()
    print("COMMAND:\tnot_macs {} {}".format(sample_size, sequence_length))
    print("SEED:\tASEED")
    # Positions are reported as a fraction of the sequence length.
    for site, (position, variant) in enumerate(tree_sequence.variants()):
        print("SITE:", site, position / sequence_length, 0.0, variant,
              sep="\t")
Ejemplo n.º 29
0
def run_dump_macs(args):
    """
    Print the tree sequence in ``args.history_file`` in macs format, so
    that it can be imported into pbwt.
    """
    tree_sequence = msprime.load(args.history_file)
    sample_size = tree_sequence.get_sample_size()
    sequence_length = tree_sequence.get_sequence_length()
    print("COMMAND:\tnot_macs {} {}".format(sample_size, sequence_length))
    print("SEED:\tASEED")
    for variant in tree_sequence.variants(as_bytes=True):
        # Positions are reported as a fraction of the sequence length.
        relative_position = variant.position / sequence_length
        genotypes = "{}".format(variant.genotypes.decode())
        print("SITE:", variant.index, relative_position, 0.0, genotypes,
              sep="\t")
Ejemplo n.º 30
0
 def verify_dump_load(self, tree_sequence):
     """
     Dump the tree sequence to ``self.temp_file``, load it back, and check
     that the loaded copy has a file UUID and the same edges and
     haplotypes.
     """
     tree_sequence.dump(self.temp_file)
     reloaded = msprime.load(self.temp_file)
     self.assertIsNotNone(reloaded.file_uuid)
     self.assertEqual(
         list(tree_sequence.edges()), list(reloaded.edges()))
     self.assertEqual(
         list(tree_sequence.haplotypes()), list(reloaded.haplotypes()))
Ejemplo n.º 31
0
 def verify_dump_load(self, tree_sequence):
     """
     Dump the tree sequence to ``self.temp_file``, load it back, and check
     that the loaded copy has a file UUID and the same edges and
     haplotypes.
     """
     tree_sequence.dump(self.temp_file)
     reloaded = msprime.load(self.temp_file)
     self.assertIsNotNone(reloaded.file_uuid)
     self.assertEqual(
         list(tree_sequence.edges()), list(reloaded.edges()))
     self.assertEqual(
         list(tree_sequence.haplotypes()), list(reloaded.haplotypes()))
Ejemplo n.º 32
0
def run_mcmc(args):
    """
    Configure and run the MCMC sampler from parsed command-line arguments.

    If ``args.tsfull`` is given, it may be an open file handle (its
    ``.name`` is used) or a path. The tree sequence is loaded from it and
    passed to ``MCMC``. Otherwise ``None`` is passed. After the run, a
    trace plot is produced when ``args.plot`` is set, and the sampler
    state is printed when ``args.verbose`` is set.
    """
    input_data_path = args.input_path
    haplotype_data_name = args.haplotype_name
    ancAllele_data_name = args.ancAllele_name
    snpPos_data_name = args.snpPos_name
    iteration = args.iteration
    thin = args.thin
    burn = args.burn
    n = args.sample_size
    seq_length = args.seq_length
    mu = args.mutation_rate
    r = args.recombination_rate
    Ne = args.Ne
    outpath = args.outpath

    tsfull = None
    if args.tsfull is not None:  # else real data
        # args.tsfull is either a file handle or a plain path. The path is
        # resolved without wrapping msprime.load in ``except
        # AttributeError``, which would also swallow an AttributeError
        # raised inside the load itself.
        tsfull_path = getattr(args.tsfull, "name", args.tsfull)
        tsfull = msprime.load(tsfull_path)
    # random.seed(args.random_seed)
    # np.random.seed(args.random_seed+1)
    mcmc = MCMC(tsfull, n, Ne, seq_length, mu, r,
                input_data_path,
                haplotype_data_name,
                ancAllele_data_name,
                snpPos_data_name, outpath, args.verbose)
    mcmc.run(iteration, thin, burn, args.verify)
    if args.plot:
        # p= comparison.plot.Trace(outpath, name= "summary")
        p = Trace(outpath)
        p.arginfer_trace()
    # if args.plot:
    #     p = plot_summary(outpath)
    #     p.plot()
    if args.verbose:
        mcmc.print_state()
Ejemplo n.º 33
0
Archivo: dev.py Proyecto: td329/msprime
def vcf_example():
    """
    Benchmark variant decoding and VCF writing.

    Loads ``tmp__NOBACKUP__/populations.hdf5``, times a full pass over
    its variants, then times writing the tree sequence as VCF with ploidy
    1 and ploidy 2. Throughput for each step is printed.
    """

    # n = 6 # 3 diploid samples from each pop
    # t = 100
    # ts = msprime.simulate(
    #     Ne=10**4,
    #     population_configurations=[
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n)],
    #     demographic_events=[
    #         msprime.MassMigration(time=t, source=1, destination=0),
    #         msprime.MassMigration(time=t, source=2, destination=0),
    #         msprime.MassMigration(time=t, source=3, destination=0),
    #         msprime.MassMigration(time=t, source=4, destination=0)],
    #     length=1 * 1e6,
    #     recombination_rate=2e-8,
    #     mutation_rate=2e-8,
    #     random_seed=1)
    # with open("test.vcf", "w") as f:

    #     ts.write_vcf(f, ploidy=2)

    ts = msprime.load("tmp__NOBACKUP__/populations.hdf5")
    # time.clock() was removed in Python 3.8; perf_counter() is used
    # instead.
    before = time.perf_counter()
    num_genotypes = 0
    for variant in ts.variants():
        num_genotypes += len(variant.genotypes)
    print(num_genotypes, ts.get_sample_size() * ts.get_num_mutations())
    duration = time.perf_counter() - before
    print("Done in ", duration, " gives ", num_genotypes * 1e-6 / duration,
          " MGenotypes decoded per second")
    print(num_genotypes)

    vcf_outputs = [
        ("tmp__NOBACKUP__/tmp_1.vcf", 1),
        ("tmp__NOBACKUP__/tmp_2.vcf", 2),
    ]
    for path, ploidy in vcf_outputs:
        before = time.perf_counter()
        with open(path, "w") as f:
            ts.write_vcf(f, ploidy=ploidy)
            # Measure each file's own size. The ploidy-2 throughput used
            # to be computed from the size of the ploidy-1 file.
            size = f.tell()
        duration = time.perf_counter() - before
        print("wrote vcf in ", duration, "seconds", (size / 2**20) / duration,
              "MB/s")
Ejemplo n.º 34
0
def write_vcf(chrom):
    """
    Generate mutations on the tree sequence for ``chrom`` and write VCF.

    Input path, mutation rate, seed and output path are looked up by
    ``chrom`` in ``args.tree_file``, ``args.mut_rate``, ``seeds`` and
    ``args.vcffile``. Mutations are generated on a dumped copy of the
    tables, which is turned back into a tree sequence and written with
    ``ploidy=1``.

    :return: ``True`` once the VCF has been written.
    """
    treefile = args.tree_file[chrom]
    vcf_path = args.vcffile[chrom]
    mut_rate = args.mut_rate[chrom]
    seed = seeds[chrom]
    logfile.write("Simulating mutations on " + treefile + "\n")
    ts = msprime.load(treefile)
    tables = ts.dump_tables()
    rng = msprime.RandomGenerator(seed)
    mutgen = msprime.MutationGenerator(rng, mut_rate)
    mutgen.generate(tables.nodes, tables.edges, tables.sites, tables.mutations)
    logfile.write("Saving to " + vcf_path + "\n")
    mutated_ts = msprime.load_tables(**tables.asdict())
    # The output is opened only once there is something to write, and the
    # context manager guarantees it is flushed and closed. The handle was
    # previously never closed.
    with open(vcf_path, "w") as vcf:
        mutated_ts.write_vcf(vcf, ploidy=1)
    return True
Ejemplo n.º 35
0
def run_match_samples(args):
    """
    Match the samples in ``args.input`` against an ancestors tree sequence
    and write the resulting tree sequence.

    The ancestors and output paths are resolved by ``get_ancestors_ts``
    and ``get_output_ts`` from the corresponding arguments and the input
    path.
    """
    setup_logging(args)

    sample_data = tsinfer.SampleData.load(args.input)
    ancestors_path = get_ancestors_ts(args.ancestors_ts, args.input)
    output_path = get_output_ts(args.output_ts, args.input)

    logger.info("Loading ancestral genealogies from {}".format(ancestors_path))
    ancestors = msprime.load(ancestors_path)
    match_options = {
        "num_threads": args.num_threads,
        "path_compression": not args.no_path_compression,
        "progress": args.progress,
    }
    inferred = tsinfer.match_samples(sample_data, ancestors, **match_options)

    logger.info("Writing output tree sequence to {}".format(output_path))
    inferred.dump(output_path)
Ejemplo n.º 36
0
def vcf_example():
    """
    Benchmark variant decoding and VCF writing.

    Loads ``tmp__NOBACKUP__/populations.hdf5``, times a full pass over
    its variants, then times writing the tree sequence as VCF with ploidy
    1 and ploidy 2. Throughput for each step is printed.
    """

    # n = 6 # 3 diploid samples from each pop
    # t = 100
    # ts = msprime.simulate(
    #     Ne=10**4,
    #     population_configurations=[
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n),
    #         msprime.PopulationConfiguration(sample_size=n)],
    #     demographic_events=[
    #         msprime.MassMigration(time=t, source=1, destination=0),
    #         msprime.MassMigration(time=t, source=2, destination=0),
    #         msprime.MassMigration(time=t, source=3, destination=0),
    #         msprime.MassMigration(time=t, source=4, destination=0)],
    #     length=1 * 1e6,
    #     recombination_rate=2e-8,
    #     mutation_rate=2e-8,
    #     random_seed=1)
    # with open("test.vcf", "w") as f:

    #     ts.write_vcf(f, ploidy=2)

    ts = msprime.load("tmp__NOBACKUP__/populations.hdf5")
    # time.clock() was removed in Python 3.8; perf_counter() is used
    # instead.
    before = time.perf_counter()
    num_genotypes = 0
    for variant in ts.variants():
        num_genotypes += len(variant.genotypes)
    print(num_genotypes, ts.get_sample_size() * ts.get_num_mutations())
    duration = time.perf_counter() - before
    print("Done in ", duration, " gives ",
            num_genotypes * 1e-6 / duration, " MGenotypes decoded per second")
    print(num_genotypes)

    vcf_outputs = [
        ("tmp__NOBACKUP__/tmp_1.vcf", 1),
        ("tmp__NOBACKUP__/tmp_2.vcf", 2),
    ]
    for path, ploidy in vcf_outputs:
        before = time.perf_counter()
        with open(path, "w") as f:
            ts.write_vcf(f, ploidy=ploidy)
            # Measure each file's own size. The ploidy-2 throughput used
            # to be computed from the size of the ploidy-1 file.
            size = f.tell()
        duration = time.perf_counter() - before
        print("wrote vcf in ", duration, "seconds", (size / 2**20) / duration, "MB/s")
Ejemplo n.º 37
0
def write_indivs(chrom):
    """
    Write a tab-separated table of the individuals in the tree sequence
    for ``chrom``.

    Input and output paths are looked up by ``chrom`` in
    ``args.tree_file`` and ``args.indivfile``. The first output line is
    the module-level ``header``. Each following line holds an
    individual's ``ped_id``, ``age``, ``subpop``, location coordinates
    and the ids of the nodes that belong to it.

    :return: ``True`` once the file has been written.
    """
    treefile = args.tree_file[chrom]
    indiv_path = args.indivfile[chrom]
    logfile.write("Reading trees from " + treefile + "\n")
    ts = msprime.load(treefile)
    # Fetch the tables once rather than on every loop iteration.
    tables = ts.tables
    node_individual = tables.nodes.individual
    num_individuals = tables.individuals.num_rows
    node_inds = [np.where(node_individual == u)[0]
                 for u in range(num_individuals)]
    individuals = [slimIndividual(tables.individuals[k])
                   for k in range(num_individuals)]
    logfile.write("Saving to " + indiv_path + "\n")
    # The output is opened only once the data is ready, and the context
    # manager closes it even if a row fails to serialise.
    with open(indiv_path, "w") as out:
        out.write("\t".join(header) + "\n")
        for ind, nodes in zip(individuals, node_inds):
            data = ([ind.ped_id, ind.age, ind.subpop]
                    + list(ind.location) + list(nodes))
            out.write("\t".join(map(str, data)) + "\n")
    return True
Ejemplo n.º 38
0
def run_dump_macs(args):
    """
    Print the tree sequence in ``args.history_file`` in macs format, so
    that it can be imported into pbwt.
    """
    tree_sequence = msprime.load(args.history_file)
    sample_size = tree_sequence.get_sample_size()
    sequence_length = tree_sequence.get_sequence_length()
    print("COMMAND:\tnot_macs {} {}".format(sample_size, sequence_length))
    print("SEED:\tASEED")
    for variant in tree_sequence.variants(as_bytes=True):
        # Positions are reported as a fraction of the sequence length.
        relative_position = variant.position / sequence_length
        genotypes = "{}".format(variant.genotypes.decode())
        print("SITE:", variant.index, relative_position, 0.0, genotypes,
              sep="\t")
def run_convert_files():
    """
    Convert the benchmark ``.trees`` files under ``data_prefix`` to VCF
    and gzipped VCF.

    Files named ``<10**k>.trees`` are processed for k = 1..7, stopping at
    the first one that does not exist. VCF output (ploidy 2) is only
    produced for k < 7.
    """
    for k in range(1, 8):
        n = 10**k
        trees_filename = os.path.join(data_prefix, "{}.trees".format(n))
        if not os.path.exists(trees_filename):
            break
        ts = msprime.load(trees_filename)
        if k < 7:
            vcf_filename = os.path.join(data_prefix, "{}.vcf".format(n))
            with open(vcf_filename, "w") as vcf_file:
                ts.write_vcf(vcf_file, 2)
            print("Wrote ", vcf_filename)
            gz_filename = vcf_filename + ".gz"
            # Run gzip with an argument list and redirect its stdout
            # ourselves, so no shell is involved and paths need no
            # quoting.
            with open(gz_filename, "wb") as gz_file:
                subprocess.check_call(
                    ["gzip", "-c", vcf_filename], stdout=gz_file)
            print("Wrote ", gz_filename)
Ejemplo n.º 40
0
def run_compute_ukbb_gnn(args):
    """
    Compute genealogical nearest neighbours (GNN) for the tree sequence in
    ``args.input`` and write them to the CSV file ``args.output``.

    Individual metadata is JSON, with the keys ``CentreName``,
    ``SampleID`` and ``Ethnicity`` read here. Nodes returned by
    ``get_augmented_samples`` are excluded. The remaining sample nodes
    are grouped into reference sets by ``CentreName``, and the output has
    one GNN column per reference set.
    """
    ts = msprime.load(args.input)
    tables = ts.tables
    started = time.time()
    augmented_samples = set(get_augmented_samples(tables))
    elapsed = time.time() - started
    print("Got augmented:", len(augmented_samples), "in ", elapsed)

    reference_sets_map = collections.defaultdict(list)

    ind_metadata = [None] * ts.num_individuals
    all_samples = []
    for ind in ts.individuals():
        md = json.loads(ind.metadata.decode())
        ind_metadata[ind.id] = md
        for node in ind.nodes:
            if node in augmented_samples:
                continue
            reference_sets_map[md["CentreName"]].append(node)
            all_samples.append(node)
    reference_set_names = list(reference_sets_map.keys())
    reference_sets = [reference_sets_map[name] for name in reference_set_names]

    # Metadata of the individual that owns each retained sample node.
    sample_metadata = [
        ind_metadata[ts.node(u).individual] for u in all_samples]
    cols = {
        "centre": [md["CentreName"] for md in sample_metadata],
        "sample_id": [md["SampleID"] for md in sample_metadata],
        "ethnicity": [md["Ethnicity"] for md in sample_metadata],
    }
    print("Computing GNNs for ", len(all_samples), "samples")
    started = time.time()
    A = ts.genealogical_nearest_neighbours(all_samples,
                                           reference_sets,
                                           num_threads=args.num_threads)
    elapsed = time.time() - started
    print("Done in {:.2f} mins".format(elapsed / 60))

    # Column j of A is the GNN proportion for reference set j.
    for j, name in enumerate(reference_set_names):
        cols[name] = A[:, j]
    pd.DataFrame(cols).to_csv(args.output)
Ejemplo n.º 41
0
    def __init__(self, max_time, ts=None, ts_file=None):
        """
        Set up the per-tree-sequence state.

        :param max_time: stored as ``self.max_time``.
        :param ts: an already-loaded tree sequence. Takes precedence over
            ``ts_file`` when both are given.
        :param ts_file: path of a tree sequence file, loaded with
            ``msprime.load`` when ``ts`` is None.
        :raises ValueError: if neither ``ts`` nor ``ts_file`` is given.
        """
        if ts is None and ts_file is None:
            # The explanation is carried in the exception itself rather
            # than printed to stdout.
            raise ValueError("One of ts or ts_file must be specified")

        if ts is None:
            ts = msprime.load(ts_file)

        self.ts = ts
        self.max_time = max_time
        self.bps = list(self.ts.breakpoints())

        self.node_times = self.get_node_times()
        # Sparse sample-by-sample matrices.
        shape = (ts.num_samples, ts.num_samples)
        self.ca_times = scipy.sparse.lil_matrix(shape)
        self.ca_last = scipy.sparse.lil_matrix(shape)
        self.ca_count = scipy.sparse.lil_matrix(shape)
        self.ibd_list = []
Ejemplo n.º 42
0
def load_tree_sequence(args, log):
    """
    Load one tree sequence from ``args.load_tree_sequence`` and build the
    bookkeeping structures for it.

    The containers come from ``initialise(args)``. The loaded tree
    sequence is filtered with ``get_common_mutations_ts``. If
    ``args.geno_prop`` is not None, a genotyped version is derived with
    ``ts.set_mutations_in_tree``; otherwise the filtered tree sequence is
    reused as the genotyped one. ``args.n`` is set as a side effect.

    :param args: argument object; reads ``load_tree_sequence`` and
        ``geno_prop``, writes ``n``.
    :param log: object with a ``log(message)`` method.
    :return: tuple ``(tree_sequence_list, tree_sequence_list_geno, m,
        m_start, m_total, m_geno, m_geno_start, m_geno_total, N, n_pops)``.
    """

    # Create a list to fill with tree_sequences.
    args, tree_sequence_list, tree_sequence_list_geno, m_total, m_geno_total, rec_map, m, m_start, m_geno, m_geno_start = initialise(
        args)
    tree_sequence_list.append(msprime.load(args.load_tree_sequence))
    # args.n is half the sample size (presumably the number of diploid
    # individuals; confirm against callers).
    args.n = int(tree_sequence_list[0].get_sample_size() / 2)
    N = args.n
    n_pops = 1

    log.log(
        "Warning: load tree sequence was included for debugging, we don't support more than 1 population, and more than 1 chromosome."
    )

    # NOTE(review): common_mutations, n_haps and rec_map are not used
    # again in this function.
    common_mutations = []
    n_haps = tree_sequence_list[0].get_sample_size()

    # Get the mutations > MAF.
    tree_sequence_list[0] = get_common_mutations_ts(args,
                                                    tree_sequence_list[0], log)

    # Only index 0 is filled in: a single tree sequence is handled.
    m[0] = int(tree_sequence_list[0].get_num_mutations())
    m_start[0] = 0
    m_total = m[0]
    log.log('Number of mutations above MAF in the generated data: {m}'.format(
        m=m[0]))
    log.log('Running total of sites > MAF cutoff: {m}'.format(m=m_total))

    # If genotyped proportion is < 1.
    if args.geno_prop is not None:
        # `ts` here is a name defined outside this function (presumably a
        # helper module; confirm), not a tree sequence.
        tree_sequence_tmp, m_geno_tmp = ts.set_mutations_in_tree(
            tree_sequence_list[0], args.geno_prop)
        tree_sequence_list_geno.append(tree_sequence_tmp)
        m_geno[0] = int(m_geno_tmp)
        # NOTE(review): the start offset uses the m_geno_total value
        # returned by initialise(), whereas the else branch uses
        # m_start[0] (0). Confirm that initialise() returns 0 here.
        m_geno_start[0] = m_geno_total
        m_geno_total = m_geno[0]
        log.log('Number of sites genotyped in the generated data: {m}'.format(
            m=m_geno[0]))
        log.log('Running total of sites genotyped: {m}'.format(m=m_geno_total))
    else:
        # No sub-sampling: the genotyped data mirrors the filtered data.
        tree_sequence_list_geno.append(tree_sequence_list[0])
        m_geno[0] = m[0]
        m_geno_start[0] = m_start[0]
        m_geno_total = m_total

    return tree_sequence_list, tree_sequence_list_geno, m, m_start, m_total, m_geno, m_geno_start, m_geno_total, N, n_pops
Ejemplo n.º 43
0
def allele_frequency_example():
    """Count the mutations whose frequency is below a fixed threshold.

    Loads ``tmp__NOBACKUP__/gqt.hdf5``, walks every tree and counts the
    mutations for which the number of leaves below the mutation's node,
    divided by the sample size, is less than ``min_frequency``.  Prints that
    count together with its share of all mutations, the total number of
    mutations, and the number of trees visited.
    """
    # The input file can be regenerated with something like:
    # n = 10000
    # ts = msprime.simulate(
    #     n, 100000, scaled_recombination_rate=0.1, scaled_mutation_rate=0.1,
    #     random_seed=1)
    ts = msprime.load("tmp__NOBACKUP__/gqt.hdf5")
    n = ts.get_sample_size()
    total_mutations = ts.get_num_mutations()
    num_mutations = 0
    min_frequency = 0.0001
    num_trees = 0
    for tree in ts.trees():
        num_trees += 1
        for pos, node in tree.mutations():
            if tree.get_num_leaves(node) / n < min_frequency:
                num_mutations += 1
    # A file without mutations would otherwise raise ZeroDivisionError here.
    fraction = num_mutations / total_mutations if total_mutations > 0 else 0.0
    print("num_mutatinos = ", num_mutations, "\t", fraction)
    print("total_mutations = ", total_mutations)
    print("num_trees = ", num_trees)
Ejemplo n.º 44
0
def build_profile_inputs(n, num_megabases):
    """Prepare the tree sequence and tsinfer sample file used for profiling.

    The simulated tree sequence is cached on disk under ``tmp__NOBACKUP__``;
    it is only simulated (and dumped) when the cache file does not exist yet.
    The ``.samples`` file is always rebuilt from scratch: an existing file is
    removed first, then every variant of the tree sequence is added as a site.

    :param n: Sample size passed to ``msprime.simulate``.
    :param num_megabases: Simulated sequence length in units of 10**6.
    """
    sequence_length = num_megabases * 10**6
    trees_path = "tmp__NOBACKUP__/profile-n={}-m={}.input.trees".format(
        n, num_megabases)
    if not os.path.exists(trees_path):
        ts = msprime.simulate(
            n, length=sequence_length, Ne=10**4, recombination_rate=1e-8,
            mutation_rate=1e-8, random_seed=10)
        print("Ran simulation: n = ", n, " num_sites = ", ts.num_sites,
              "num_trees =", ts.num_trees)
        ts.dump(trees_path)
    else:
        ts = msprime.load(trees_path)

    samples_path = "tmp__NOBACKUP__/profile-n={}-m={}.samples".format(
        n, num_megabases)
    # Start from a clean slate so a stale samples file is never reused.
    if os.path.exists(samples_path):
        os.unlink(samples_path)

    sample_data_builder = tsinfer.SampleData(
        sequence_length=ts.sequence_length, path=samples_path,
        num_flush_threads=4)
    with sample_data_builder as sample_data:
        # One progress tick per site copied into the sample data file.
        site_progress = tqdm.tqdm(total=ts.num_sites)
        for variant in ts.variants():
            sample_data.add_site(variant.site.position, variant.genotypes)
            site_progress.update()
        site_progress.close()

    print(sample_data)
Ejemplo n.º 45
0
def run_dump_macs(args):
    """
    Write a macs formatted file so we can import into pbwt.

    Reads the tree sequence at ``args.history_file`` and prints a ``COMMAND``
    line, a ``SEED`` line and then one tab-separated ``SITE`` line per
    mutation, carrying a running site index, the position divided by the
    number of loci, a constant ``0.0`` and a 0/1 string with one character
    per sample.
    """
    tree_sequence = msprime.load(args.history_file)
    num_samples = tree_sequence.get_sample_size()
    num_loci = tree_sequence.get_num_loci()
    print("COMMAND:\tnot_macs {} {}".format(num_samples, num_loci))
    print("SEED:\tASEED")
    site_index = 0
    for tree in tree_sequence.trees():
        for position, node in tree.mutations():
            haplotype = ['0'] * num_samples
            # Leaf ids are shifted down by one to get the column; presumably
            # they are 1-based in the msprime version targeted - verify.
            for leaf in tree.leaves(node):
                haplotype[leaf - 1] = '1'
            columns = (
                "SITE:", site_index, position / num_loci, 0.0,
                "".join(haplotype))
            print(*columns, sep="\t")
            site_index += 1
Ejemplo n.º 46
0
def examine():
    """Print summary statistics for ``tmp__NOBACKUP__/bottleneck-example.hdf5``.

    Reports the number of records, how many records have more than two
    children and the largest child count among those, the number of trees,
    and a histogram mapping the number of nodes reachable from the root to
    the number of trees with that many nodes.
    """
    ts = msprime.load("tmp__NOBACKUP__/bottleneck-example.hdf5")
    print("num_records = ", ts.get_num_records())

    non_binary_records = 0
    max_record_length = 0
    for record in ts.records():
        num_children = len(record.children)
        if num_children <= 2:
            continue
        non_binary_records += 1
        if num_children > max_record_length:
            max_record_length = num_children
    print("non_binary_records = ", non_binary_records)
    print("max_record_length = ", max_record_length)

    tree_size_histogram = collections.Counter()
    num_trees = 0
    for tree in ts.trees():
        tree_size = sum(1 for _ in tree.nodes(tree.get_root()))
        tree_size_histogram[tree_size] += 1
        num_trees += 1
    print("num_trees = ", num_trees)
    for tree_size, count in tree_size_histogram.items():
        print(tree_size, "->", count)
Ejemplo n.º 47
0
def ld_dev():
    """Development driver running ``ld_worker`` on several threads.

    Loads the tree sequence named by ``sys.argv[1]``, splits its mutations
    into intervals at roughly every ``num_trees // num_threads`` trees, and
    starts one ``ld_worker`` thread per interval, each with its own
    ``_msprime.LdCalculator``.  Blocks until all workers have finished.
    """
    num_threads = 10
    ts = msprime.load(sys.argv[1])
    print("num trees = ", ts.get_num_trees())
    print("num mutations  = ", ts.get_num_mutations())
    # Alternatives tried during development:
    # num_mutations = min(ts.get_num_mutations(), 100000)
    # num_mutations = ts.get_num_mutations()
    num_mutations = 1000
    ld_calcs = [
        _msprime.LdCalculator(ts._ll_tree_sequence) for _ in range(num_threads)]
    k = ts.get_num_trees() // num_threads
    start = 0
    next_block = k
    intervals = []
    for tree in ts.trees():
        if tree.get_index() >= next_block:
            mutations = list(tree.mutations())
            # A block boundary can only be placed on a tree carrying at least
            # one mutation; otherwise keep looking at the following trees.
            if len(mutations) > 0:
                stop = mutations[-1].index
                intervals.append((start, stop))
                start = stop
                next_block += k

    threads = []
    lock = threading.Lock()
    progress = [0 for j in range(num_threads)]
    # Fewer than ``num_threads`` intervals can be produced (for instance when
    # the number of trees is an exact multiple of ``num_threads``), so pair
    # the calculators with the intervals that exist instead of indexing
    # ``intervals`` by thread number, which raised IndexError in that case.
    # NOTE(review): mutations after the last recorded ``stop`` are not
    # assigned to any worker - confirm whether that tail should be covered.
    for j, (ld_calc, (start, stop)) in enumerate(zip(ld_calcs, intervals)):
        thread = threading.Thread(
            name="ld_worker_{}".format(j), target=ld_worker,
            args=(ld_calc, start, stop, num_mutations, j, lock, progress))
        thread.start()
        threads.append(thread)

    print("Main thread joining")
    for thread in threads:
        thread.join()
    print("Main thread done")
Ejemplo n.º 48
0
def pop_example():
    """Time ``get_pairwise_diversity`` on each population of a saved simulation.

    Loads ``populations.hdf5`` and, for each of the five population ids,
    fetches that population's samples and computes their pairwise diversity,
    then prints the total duration and the duration per call.  The disabled
    first branch holds the simulation that produces ``populations.hdf5``.
    """
    # Flip this condition to regenerate ``populations.hdf5``.
    if False:
        t = 100
        ts = msprime.simulate(
            Ne=10**4,
            population_configurations=[
                msprime.PopulationConfiguration(sample_size=1000),
                msprime.PopulationConfiguration(sample_size=1000),
                msprime.PopulationConfiguration(sample_size=1000),
                msprime.PopulationConfiguration(sample_size=1000),
                msprime.PopulationConfiguration(sample_size=1000)],
            demographic_events=[
                msprime.MassMigration(time=t, source=1, destination=0),
                msprime.MassMigration(time=t, source=2, destination=0),
                msprime.MassMigration(time=t, source=3, destination=0),
                msprime.MassMigration(time=t, source=4, destination=0)],
            length=100 * 1e6,
            recombination_rate=2e-8,
            mutation_rate=2e-8,
            random_seed=1)
        ts.dump("populations.hdf5")
        print(
            ts.get_sample_size(), ts.get_num_trees(),
            ts.get_num_mutations())
    else:
        ts = msprime.load("populations.hdf5")
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # recommended clock for timing short code sections.
        before = time.perf_counter()
        R = 1
        for i in range(R):
            for j in range(5):
                samples = ts.get_samples(population_id=j)
                # Only the running time matters here; the result is discarded.
                ts.get_pairwise_diversity(samples)
        duration = time.perf_counter() - before
        print("duration = ", duration, " per call = ", duration / (5 * R))
Ejemplo n.º 49
0
                assert tail is None
            else:
                x = head
                while x.next is not None:
                    x = x.next
                assert x == tail
                x = head.next
                while x is not None:
                    assert x.left < x.right
                    if x.next is not None:
                        assert x.right <= x.next.left
                        # We should also not have any squashable segments.
                        if x.right == x.next.left:
                            assert x.node != x.next.node
                    x = x.next


if __name__ == "__main__":
    # Minimal command line driver for the simplifier above: the first
    # argument is the tree sequence file, all further arguments are the
    # integer sample ids to simplify down to.
    ts = msprime.load(sys.argv[1])
    samples = [int(arg) for arg in sys.argv[2:]]
    s = Simplifier(ts, samples)
    # s.print_state()
    tss, _ = s.simplify()
    tables = tss.dump_tables()
    print("Output:")
    for table in (tables.nodes, tables.edges, tables.sites, tables.mutations):
        print(table)
Ejemplo n.º 50
0
def run_dump_mutations(args):
    """Write the mutations of the tree sequence in ``args.history_file`` to stdout.

    ``args.header`` and ``args.precision`` are forwarded unchanged to
    ``write_mutations``.
    """
    msprime.load(args.history_file).write_mutations(
        sys.stdout, args.header, args.precision)
Ejemplo n.º 51
0
def run_dump_vcf(args):
    """Write the tree sequence in ``args.history_file`` to stdout as VCF.

    ``args.ploidy`` is forwarded unchanged to ``write_vcf``.
    """
    msprime.load(args.history_file).write_vcf(sys.stdout, args.ploidy)
Ejemplo n.º 52
0
def run_dump_variants(args):
    """Print one line per variant of the tree sequence in ``args.history_file``.

    Each line is the variant's position, a tab, and its genotypes decoded
    from the bytes returned by ``variants(as_bytes=True)``.
    """
    history = msprime.load(args.history_file)
    for variant in history.variants(as_bytes=True):
        genotype_text = variant.genotypes.decode()
        print(variant.position, genotype_text, sep="\t")
Ejemplo n.º 53
0
def run_dump_haplotypes(args):
    """Print every haplotype of the tree sequence in ``args.history_file``, one per line."""
    history = msprime.load(args.history_file)
    for haplotype in history.haplotypes():
        print(haplotype)
Ejemplo n.º 54
0
def run_dump_newick(args):
    """Print the Newick string of every tree in ``args.history_file``.

    ``newick_trees`` is called with ``args.precision`` and yields pairs; only
    the second element (the Newick string) is printed, the first is ignored.
    """
    history = msprime.load(args.history_file)
    for _, newick in history.newick_trees(args.precision):
        print(newick)
Ejemplo n.º 55
0
 def test_single_locus_example_recombination(self):
     """Finish the single-locus SLiM example with recombination and verify it."""
     slim_ts = msprime.load("tests/data/SLiM/single-locus-example.trees")
     completed_ts = self.finish_simulation(
         slim_ts, recombination_rate=0.1, seed=1)
     self.verify_completed(slim_ts, completed_ts)
Ejemplo n.º 56
0
def run_dump_mutations(args):
    """Print the mutations of the tree sequence in ``args.history_file``.

    Emits one tab-separated ``position node`` line per mutation, preceded by
    an ``x``/``u`` header line when ``args.header`` is truthy.
    """
    history = msprime.load(args.history_file)
    if args.header:
        print("x", "u", sep="\t")
    for mutation in history.mutations():
        position, node = mutation
        print(position, node, sep="\t")
Ejemplo n.º 57
0
def run_dump_records(args):
    """Print the coalescence records of the tree sequence in ``args.history_file``.

    Emits one tab-separated line per record holding left, right, node, the
    first two children and the time, preceded by a header line when
    ``args.header`` is truthy.  Only the first two children are written.
    """
    history = msprime.load(args.history_file)
    if args.header:
        print("l", "r", "u", "c1", "c2", "t", sep="\t")
    for left, right, node, children, node_time in history.records():
        first_child = children[0]
        second_child = children[1]
        print(left, right, node, first_child, second_child, node_time,
              sep="\t")
Ejemplo n.º 58
0
 def test_minimal_example_no_recombination(self):
     """Finishing the minimal SLiM example with zero recombination must fail."""
     slim_ts = msprime.load("tests/data/SLiM/minimal-example.trees")
     # A recombination rate of zero is rejected because coordinates cannot
     # be remapped into the genetic map.
     expected_error = _msprime.InputError
     with self.assertRaises(expected_error):
         self.finish_simulation(slim_ts, recombination_rate=0, seed=1)
Ejemplo n.º 59
0
def dump_file(filename):
    """Print every record of the tree sequence stored in ``filename``, one per line."""
    history = msprime.load(filename)
    for record in history.records():
        print(record)