Example #1
 def setUp(self):
     self.tempdir = tempfile.TemporaryDirectory(prefix="tsinfer_cli_test")
     self.sample_file = str(
         pathlib.Path(self.tempdir.name, "input-data.samples"))
     self.ancestor_file = str(
         pathlib.Path(self.tempdir.name, "input-data.ancestors"))
     self.ancestor_trees = str(
         pathlib.Path(self.tempdir.name, "input-data.ancestors.trees"))
     self.output_trees = str(
         pathlib.Path(self.tempdir.name, "input-data.trees"))
     self.input_ts = msprime.simulate(10,
                                      mutation_rate=10,
                                      recombination_rate=10,
                                      random_seed=10)
     sample_data = tsinfer.SampleData(
         sequence_length=self.input_ts.sequence_length,
         path=self.sample_file)
     for var in self.input_ts.variants():
         sample_data.add_site(var.site.position, var.genotypes, var.alleles)
     sample_data.finalise()
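     # Standard tsinfer pipeline: generate ancestral haplotypes, match the
     # ancestors against each other, then match the samples into that result.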
     tsinfer.generate_ancestors(sample_data,
                                path=self.ancestor_file,
                                chunk_size=10)
     ancestor_data = tsinfer.load(self.ancestor_file)
     ancestors_ts = tsinfer.match_ancestors(sample_data, ancestor_data)
     ancestors_ts.dump(self.ancestor_trees)
     ts = tsinfer.match_samples(sample_data, ancestors_ts)
     ts.dump(self.output_trees)
     sample_data.close()
Example #2
def run_generate_ancestors(args):
    setup_logging(args)
    ancestors_path = get_ancestors_path(args.ancestors, args.samples)
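    # Write the generated ancestors straight to ancestors_path; the threading
    # options are taken from the command-line arguments.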
    sample_data = tsinfer.SampleData.load(args.samples)
    tsinfer.generate_ancestors(
        sample_data,
        progress_monitor=args.progress,
        path=ancestors_path,
        num_flush_threads=args.num_flush_threads,
        num_threads=args.num_threads,
    )
    summarise_usage()
Example #3
 def test_ancestors_ts(self):
     ts = msprime.simulate(10, mutation_rate=1, random_seed=1)
     assert ts.num_sites > 1
     samples = tsinfer.SampleData.from_tree_sequence(ts)
     ancestor_data = tsinfer.generate_ancestors(samples)
     ancestors_ts = tsinfer.match_ancestors(samples, ancestor_data)
     self.validate_ts(ancestors_ts)
Example #4
def generate_ancestors(samples_fn, num_threads, prefix):
    sample_data = tsinfer.load(samples_fn)
    anc = tsinfer.generate_ancestors(
        sample_data,
        num_threads=num_threads,
        path=prefix + ".ancestors",
        progress_monitor=True,
    )
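    # If any individuals have non-zero (historical) times, insert them as proxy
    # ancestors so their haplotypes can act as copying sources during matching.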
    if np.any(sample_data.individuals_time[:] != 0):
        anc_w_proxy = anc.insert_proxy_samples(sample_data,
                                               allow_mutation=True)
        anc = anc_w_proxy.copy(path=prefix + ".proxy.ancestors")
        anc.finalise()
    maximum_time = np.max(anc.ancestors_time[:])
    # Hacky way of checking if we used frequency to order ancestors
    if maximum_time < 3:
        anc = anc.truncate_ancestors(0.4,
                                     0.6,
                                     length_multiplier=1,
                                     path=prefix + ".truncated.ancestors")
    else:
        upper_time_limit = maximum_time * 0.6
        lower_time_limit = maximum_time * 0.4
        anc = anc.truncate_ancestors(
            lower_time_limit,
            upper_time_limit,
            length_multiplier=1,
            path=prefix + ".truncated.ancestors",
        )
    return anc
Example #5
def visualise(ts,
              recombination_rate,
              error_rate,
              engine="C",
              box_size=8,
              perfect_ancestors=False,
              path_compression=False,
              time_chunking=False):

    sample_data = tsinfer.SampleData.from_tree_sequence(ts)

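    # Either build the exact ancestors known from the simulated tree sequence
    # ("perfect" ancestors) or infer ancestors from the sample data.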
    if perfect_ancestors:
        ancestor_data = tsinfer.AncestorData(sample_data)
        tsinfer.build_simulated_ancestors(sample_data,
                                          ancestor_data,
                                          ts,
                                          time_chunking=time_chunking)
        ancestor_data.finalise()
    else:
        ancestor_data = tsinfer.generate_ancestors(sample_data, engine=engine)

    ancestors_ts = tsinfer.match_ancestors(sample_data,
                                           ancestor_data,
                                           engine=engine,
                                           path_compression=path_compression,
                                           extended_checks=True)
    inferred_ts = tsinfer.match_samples(sample_data,
                                        ancestors_ts,
                                        engine=engine,
                                        simplify=False,
                                        path_compression=path_compression,
                                        extended_checks=True)

    prefix = "tmp__NOBACKUP__/"
    visualiser = Visualiser(ts,
                            sample_data,
                            ancestor_data,
                            inferred_ts,
                            box_size=box_size)
    visualiser.draw_copying_paths(os.path.join(prefix, "copying_{}.png"))

    # tsinfer.print_tree_pairs(ts, inferred_ts, compute_distances=False)
    inferred_ts = tsinfer.match_samples(sample_data,
                                        ancestors_ts,
                                        engine=engine,
                                        simplify=True,
                                        path_compression=False,
                                        stabilise_node_ordering=True)

    tsinfer.print_tree_pairs(ts, inferred_ts, compute_distances=True)
    sys.stdout.flush()
    print("num_sites = ", inferred_ts.num_sites, "num_mutations= ",
          inferred_ts.num_mutations)

    for site in inferred_ts.sites():
        if len(site.mutations) > 1:
            print("Multiple mutations at ", site.id, "over",
                  [mut.node for mut in site.mutations])
Example #6
def visualise_ancestors():
    ts = msprime.simulate(10,
                          mutation_rate=2,
                          recombination_rate=2,
                          random_seed=3)
    ts = tsinfer.strip_singletons(ts)
    sample_data = tsinfer.SampleData.from_tree_sequence(ts)
    ancestor_data = tsinfer.generate_ancestors(sample_data)
    viz = AncestorBuilderViz(sample_data, ancestor_data)

    viz.draw(6, "ancestors_{}.svg")
Example #7
def infer_with_mismatch(
    sample_data,
    path_to_genetic_map,
    ma_mismatch=1,
    ms_mismatch=1,
    precision=15,
    num_threads=1,
    path_compression=True,
    progress_monitor=False,
):
    ancestors = tsinfer.generate_ancestors(
        sample_data, num_threads=num_threads, progress_monitor=progress_monitor
    )
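    # Convert the genetic map into per-site recombination probabilities at the
    # inference sites, and use a single mismatch probability scaled by the
    # median genetic distance between adjacent sites.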
    gmap = msprime.RateMap.read_hapmap(
        path_to_genetic_map, sequence_length=ancestors.sequence_length
    )
    genetic_dists = tsinfer.Matcher.recombination_rate_to_dist(
        gmap, ancestors.sites_position[:]
    )
    recombination = tsinfer.Matcher.recombination_dist_to_prob(genetic_dists)
    recombination[recombination == 0] = 1e-20
    mismatch = np.full(
        len(ancestors.sites_position[:]),
        tsinfer.Matcher.mismatch_ratio_to_prob(1, np.median(genetic_dists), 2),
    )

    ancestors_ts = tsinfer.match_ancestors(
        sample_data,
        ancestors,
        recombination=recombination,
        mismatch=mismatch,
        precision=precision,
        num_threads=num_threads,
        path_compression=path_compression,
        progress_monitor=progress_monitor,
    )
    return tsinfer.match_samples(
        sample_data,
        ancestors_ts,
        recombination=recombination,
        mismatch=mismatch,
        precision=precision,
        num_threads=num_threads,
        path_compression=path_compression,
        progress_monitor=progress_monitor,
    )
Example #8
def setup_sample_file(base_filename, args, num_threads=1):
    """
    Return the path to a sample data file, the path to the ancestors file, a
    corresponding recombination rate (a single number or a RateMap), a prefix
    to use for files, and None.
    """
    gmap = args.genetic_map
    sd = tsinfer.load(base_filename + ".samples")

    anc = tsinfer.generate_ancestors(
        sd,
        num_threads=num_threads,
        path=base_filename + ".ancestors",
    )
    logger.info("GA done")

    inference_pos = anc.sites_position[:]

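    # Try to detect a chromosome name in the filename so the matching HapMapII
    # recombination map can be fetched from stdpopsim; otherwise fall back to a
    # flat rate.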
    match = re.search(r'(chr\d+)', base_filename)
    if match or gmap is not None:
        if gmap is not None:
            logger.info(f"Using {gmap} for the recombination map")
            rho = intervals.read_hapmap(gmap)
        else:
            chr = match.group(1)
            logger.info(
                f"Using {chr} from HapMapII_GRCh37 for the recombination map")
            gmap = stdpopsim.get_species("HomSap").get_genetic_map(
                id="HapMapII_GRCh37")
            if not gmap.is_cached():
                gmap.download()
            filename = os.path.join(gmap.map_cache_dir,
                                    gmap.file_pattern.format(id=chr))
            rho = intervals.read_hapmap(filename)
    else:
        rho = 1e-8  # shouldn't matter what this is - it is relative to the mismatch rate

    #if np.any(d==0):
    #    w = np.where(d==0)
    #    raise ValueError("Zero recombination rates at", w, inference_pos[w])

    return sd.path, anc.path, rho, "", None
Example #9
def infer_with_mismatch(sample_data,
                        chromosome,
                        ma_mismatch=0.1,
                        ms_mismatch=0.1,
                        precision=None,
                        modern_samples_match=False,
                        ancient_ancestors=False,
                        num_threads=1):
    ancestors = tsinfer.generate_ancestors(sample_data,
                                           num_threads=num_threads)
    genetic_map = run_inference.get_genetic_map(chromosome)
    rho, ma_mis, ms_mis, precision = run_inference.get_rho(
        sample_data,
        ancestors,
        genetic_map,
        ma_mismatch,
        ms_mismatch,
        precision=None,
        num_threads=num_threads)
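    # Replace zero recombination rates with a small nonzero value so the
    # matching likelihoods remain well defined.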
    rho[:-1][rho[:-1] == 0] = np.min(rho[:-1][rho[:-1] != 0]) / 100
    ancestors_ts = run_inference.match_ancestors(
        sample_data,
        ancestors,
        rho,
        ma_mis,
        precision=13,
        ancient_ancestors=ancient_ancestors,
        num_threads=num_threads)
    return run_inference.match_samples(
        sample_data,
        ancestors_ts,
        rho,
        ms_mis,
        13,
        modern_samples_match=modern_samples_match,
        ancient_ancestors=ancient_ancestors,
        num_threads=num_threads)
Example #10
    ancient_samples.add_individual(ploidy=1, metadata={})

for v in ts.variants():
    #    if np.random.randint(4) != 1:
    ancient_samples.add_site(position=v.site.position,
                             alleles=v.alleles,
                             genotypes=v.genotypes[ancient_sample_indices])
ancient_samples.finalise()

#%%
# Infer and date tree from modern samples

primary_ts = ts.simplify(modern_sample_indices, filter_sites=False)
primary_samples = tsinfer.SampleData.from_tree_sequence(primary_ts)

ancestors = tsinfer.generate_ancestors(primary_samples)
ancestors_ts = tsinfer.match_ancestors(
    primary_samples, ancestors)  # This only has inference sites

primary_inferred_ts = tsinfer.match_samples(primary_samples,
                                            ancestors_ts,
                                            simplify=False)
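# Simplify down to just the sample nodes (node flag 1), keeping unary nodes,
# before dating the inferred tree sequence with tsdate.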
primary_inferred_ts_simplified = primary_inferred_ts.simplify(
    np.where(primary_inferred_ts.tables.nodes.flags == 1)[0], keep_unary=True)

tsdate.date(primary_inferred_ts_simplified,
            Ne=stable_pop_size,
            mutation_rate=2.5e-5)

#%%
# rest of inference- augmenting older samples in
Example #11
def tsinfer_dev(n,
                L,
                seed,
                num_threads=1,
                recombination_rate=1e-8,
                error_rate=0,
                engine="C",
                log_level="WARNING",
                debug=True,
                progress=False,
                path_compression=True):

    np.random.seed(seed)
    random.seed(seed)
    L_megabases = int(L * 10**6)

    # daiquiri.setup(level=log_level)

    ts = msprime.simulate(n,
                          Ne=10**4,
                          length=L_megabases,
                          recombination_rate=recombination_rate,
                          mutation_rate=1e-8,
                          random_seed=seed)
    if debug:
        print("num_sites = ", ts.num_sites)
    assert ts.num_sites > 0

    samples = tsinfer.SampleData.from_tree_sequence(ts)

    ancestor_data = tsinfer.generate_ancestors(samples,
                                               engine=engine,
                                               num_threads=num_threads)
    ancestors_ts = tsinfer.match_ancestors(samples,
                                           ancestor_data,
                                           engine=engine,
                                           path_compression=True,
                                           extended_checks=False)

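    # Augment the ancestors tree sequence with copying paths for samples
    # 5, 6 and 7 before the final round of sample matching.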
    ancestors_ts = tsinfer.augment_ancestors(samples,
                                             ancestors_ts, [5, 6, 7],
                                             engine=engine)

    ts = tsinfer.match_samples(samples,
                               ancestors_ts,
                               path_compression=False,
                               engine=engine,
                               simplify=True)

    # print(ts.tables.edges)
    # print(ts.dump_tables())

    # simplified = ts.simplify()
    # print("edges before = ", simplified.num_edges)

    # new_ancestors_ts = insert_srb_ancestors(ts)
    # ts = tsinfer.match_samples(samples, new_ancestors_ts,
    #         path_compression=False, engine=engine,
    #         simplify=True)

    #     for tree in ts.trees():
    #         print(tree.interval)
    #         print(tree.draw(format="unicode"))

    # print(ts.tables.edges)
    # for tree in ts.trees():
    #     print(tree.draw(format="unicode"))

    tsinfer.verify(samples, ts)
Example #12
def run(params):
    """
    Run a single inference, with the specified rates
    """

    prefix = None
    if params.sample_data.path is not None:
        assert params.sample_data.path.endswith(".samples")
        prefix = params.sample_data.path[0:-len(".samples")]
    start_time = time.process_time()
    ga_start_time = time.process_time()
    if os.path.isfile(prefix + ".ancestors") == False:
        anc = tsinfer.generate_ancestors(
            params.sample_data,
            num_threads=params.num_threads,
            path=prefix + ".ancestors",
            progress_monitor=tsinfer.cli.ProgressMonitor(1, 1, 0, 0, 0),
        )
        print(
            f"GA done (ma_mut: {params.ma_mut_rate}, ms_mut: {params.ms_mut_rate})"
        )
    else:
        anc = tsinfer.load(prefix + ".ancestors")
    ga_process_time = time.process_time() - ga_start_time

    anc_w_proxy = anc.insert_proxy_samples(params.sample_data,
                                           allow_mutation=True)
    # If any proxy ancestors were added, save the proxy ancestors file and use for matching
    if anc_w_proxy.num_ancestors != anc.num_ancestors:
        anc = anc_w_proxy.copy(path=prefix + ".proxy.ancestors")
        anc.finalise()
        path_compression = False
    else:
        path_compression = True

    rec_rate = get_rho(anc, params.filename)
    rho = rec_rate[1:]
    base_rec_prob = np.quantile(rho, 0.5)
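    # Pick enough digits of likelihood precision to resolve both the smallest
    # per-site recombination probability and the smallest mismatch probability.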
    if params.precision is None:
        # Smallest recombination rate
        min_rho = int(np.ceil(-np.min(np.log10(rho))))
        # Smallest mean
        av_min = int(
            np.ceil(-np.log10(
                min(1, params.ma_mut_rate, params.ms_mut_rate) *
                base_rec_prob)))
        precision = max(min_rho, av_min) + 3
    else:
        precision = params.precision
    print(
        f"Starting {params.ma_mut_rate} {params.ms_mut_rate}",
        f"with base rho {base_rec_prob:.5g}",
        f"(mean {np.mean(rho):.4g} median {np.quantile(rho, 0.5):.4g}",
        f"min {np.min(rho):.4g}, 2.5% quantile {np.quantile(rho, 0.025):.4g})",
        f"precision {precision}")
    ma_start_time = time.process_time()
    if os.path.isfile(prefix + ".atrees") == False:
        inferred_anc_ts = tsinfer.match_ancestors(
            params.sample_data,
            anc,
            num_threads=params.num_threads,
            precision=precision,
            recombination_rate=rec_rate,
            mismatch_rate=base_rec_prob * params.ma_mut_rate,
            path_compression=path_compression,
            progress_monitor=tsinfer.cli.ProgressMonitor(1, 0, 1, 0, 0),
        )
        inferred_anc_ts.dump(path=prefix + ".atrees")
        print(
            f"MA done (ma_mut:{params.ma_mut_rate} ms_mut{params.ms_mut_rate})"
        )
    else:
        inferred_anc_ts = tskit.load(prefix + ".atrees")
    ma_process_time = time.process_time() - ma_start_time

    ms_start_time = time.process_time()
    if os.path.isfile(prefix + ".trees") == False:
        inferred_ts = tsinfer.match_samples(
            params.sample_data,
            inferred_anc_ts,
            num_threads=params.num_threads,
            precision=precision,
            recombination_rate=rec_rate,
            mismatch_rate=base_rec_prob * params.ms_mut_rate,
            progress_monitor=tsinfer.cli.ProgressMonitor(1, 0, 0, 0, 1),
            force_sample_times=True,
            simplify=False)
        print(f"MS done: ms_mut rate = {params.ms_mut_rate})")
        process_time = time.process_time() - start_time
        ms_process_time = time.process_time() - ms_start_time
        ts_path = prefix + ".nosimplify.trees"
        inferred_ts.dump(path=ts_path)
    else:
        raise ValueError("Inferred tree sequence already present")

    return Results(ma_mut=params.ma_mut_rate,
                   ms_mut=params.ms_mut_rate,
                   precision=precision,
                   edges=inferred_ts.num_edges,
                   muts=inferred_ts.num_mutations,
                   num_trees=inferred_ts.num_trees,
                   process_time=process_time,
                   ga_process_time=ga_process_time,
                   ma_process_time=ma_process_time,
                   ms_process_time=ms_process_time,
                   ts_size=os.path.getsize(ts_path),
                   ts_path=ts_path)
Example #13
def run_build():

    sample_data = tsinfer.load(sys.argv[1])
    ad = tsinfer.generate_ancestors(sample_data)
    print(ad)
Example #14
def tsinfer_dev(
    n,
    L,
    seed,
    num_threads=1,
    recombination_rate=1e-8,
    error_rate=0,
    engine="C",
    log_level="WARNING",
    precision=None,
    debug=True,
    progress=False,
    path_compression=True,
):

    np.random.seed(seed)
    random.seed(seed)
    L_megabases = int(L * 10**6)

    # daiquiri.setup(level=log_level)

    ts = msprime.simulate(
        n,
        Ne=10**4,
        length=L_megabases,
        recombination_rate=recombination_rate,
        mutation_rate=1e-8,
        random_seed=seed,
    )
    if debug:
        print("num_sites = ", ts.num_sites)
    assert ts.num_sites > 0

    # ts = msprime.mutate(ts, rate=1e-8, random_seed=seed,
    #         model=msprime.InfiniteSites(msprime.NUCLEOTIDES))

    samples = tsinfer.SampleData.from_tree_sequence(ts)
    rho = recombination_rate
    mu = 1e-3  # 1e-15

    #     num_alleles = samples.num_alleles(inference_sites=True)
    #     num_sites = samples.num_inference_sites
    #     with tsinfer.AncestorData(samples) as ancestor_data:
    #         t = np.sum(num_alleles) + 1
    #         for j in range(num_sites):
    #             for allele in range(num_alleles[j]):
    #                 ancestor_data.add_ancestor(j, j + 1, t, [j], [allele])
    #                 t -= 1

    ancestor_data = tsinfer.generate_ancestors(samples,
                                               engine=engine,
                                               num_threads=num_threads)

    ancestors_ts = tsinfer.match_ancestors(
        samples,
        ancestor_data,
        engine=engine,
        path_compression=True,
        extended_checks=False,
        precision=precision,
        recombination_rate=rho,
        mutation_rate=mu,
    )
    # print(ancestors_ts.tables)
    # print("ancestors ts")
    # for tree in ancestors_ts.trees():
    #     print(tree.draw_text())
    #     for site in tree.sites():
    #         if len(site.mutations) > 1:
    #             print(site.id)
    #             for mutation in site.mutations:
    #                 print("\t", mutation.node, mutation.derived_state)

    # for var in ancestors_ts.variants():
    #     print(var.genotypes)

    # print(ancestors_ts.tables)

    # ancestors_ts = tsinfer.augment_ancestors(samples, ancestors_ts,
    #         [5, 6, 7], engine=engine)

    ts = tsinfer.match_samples(
        samples,
        ancestors_ts,
        recombination_rate=rho,
        mutation_rate=mu,
        path_compression=False,
        engine=engine,
        precision=precision,
        simplify=False,
    )

    print("num_edges = ", ts.num_edges)

    # # print(ts.draw_text())
    # for tree in ts.trees():
    #     print(tree.draw_text())
    #     for site in tree.sites():
    #         if len(site.mutations) > 1:
    #             print(site.id)
    #             for mutation in site.mutations:
    #                 print("\t", mutation.node, mutation.derived_state)

    # # print(ts.tables.edges)
    # print(ts.dump_tables())

    # simplified = ts.simplify()
    # print("edges before = ", simplified.num_edges)

    # new_ancestors_ts = insert_srb_ancestors(ts)
    # ts = tsinfer.match_samples(samples, new_ancestors_ts,
    #         path_compression=False, engine=engine,
    #         simplify=True)

    #     for tree in ts.trees():
    #         print(tree.interval)
    #         print(tree.draw(format="unicode"))

    # print(ts.tables.edges)
    # for tree in ts.trees():
    #     print(tree.draw(format="unicode"))

    tsinfer.verify(samples, ts)
Example #15
 def test_ancestors_ts(self, small_sd_fixture):
     ancestor_data = tsinfer.generate_ancestors(small_sd_fixture)
     ancestors_ts = tsinfer.match_ancestors(small_sd_fixture, ancestor_data)
     self.validate_ts(ancestors_ts)
Example #16
def run(params):
    """
    Run a single inference, with the specified rates
    """
    rho = params.rec_rate[1:]
    base_rec_prob = np.quantile(rho, 0.5)
    ma_mis_rate = ms_mis_rate = 1.0
    if params.precision is None:
        # Smallest recombination rate
        min_rho = int(np.ceil(-np.min(np.log10(rho))))
        # Smallest mean
        av_min = int(
            np.ceil(
                -np.log10(min(1, ma_mis_rate, ms_mis_rate) * base_rec_prob)))
        precision = max(min_rho, av_min) + 3
    else:
        precision = params.precision
    ma_mis = base_rec_prob * ma_mis_rate
    ms_mis = base_rec_prob * ms_mis_rate
    print(
        f"Starting {params.cutoff_power}, trim_oldest={params.trim_oldest}",
        f"with base rho {base_rec_prob:.5g}",
        f"(mean {np.mean(rho):.4g} median {np.quantile(rho, 0.5):.4g}",
        f"min {np.min(rho):.4g}, 2.5% quantile {np.quantile(rho, 0.025):.4g})",
        f"precision {precision}")
    prefix = inf_prefix = None
    if params.sample_data.path is not None:
        assert params.sample_data.path.endswith(".samples")
        prefix = params.sample_data.path[0:-len(".samples")]
        inf_prefix = "{}_rma{}_rms{}_N{}_{}_p{}".format(
            prefix, ma_mis_rate, ms_mis_rate, params.cutoff_power,
            "trim" if params.trim_oldest else "norm", precision)
    start_time = time.process_time()
    anc = tsinfer.generate_ancestors(
        params.sample_data,
        cutoff_power=params.cutoff_power,
        trim_oldest=params.trim_oldest,
        num_threads=params.num_threads,
        path=None if inf_prefix is None else inf_prefix + ".ancestors",
    )
    print(f"GA done (rel_ma_mis:{ma_mis_rate}, rel_ms_mis:{ms_mis_rate})")
    inferred_anc_ts = tsinfer.match_ancestors(
        params.sample_data,
        anc,
        num_threads=params.num_threads,
        precision=precision,
        recombination_rate=params.rec_rate,
        mismatch_rate=ma_mis,
    )
    inferred_anc_ts.dump(path=inf_prefix + ".atrees")
    print(f"MA done: abs_ma_mis rate = {ma_mis}")
    inferred_ts = tsinfer.match_samples(params.sample_data,
                                        inferred_anc_ts,
                                        num_threads=params.num_threads,
                                        precision=precision,
                                        recombination_rate=params.rec_rate,
                                        mismatch_rate=ms_mis)
    process_time = time.process_time() - start_time
    ts_path = inf_prefix + ".trees"
    inferred_ts.dump(path=ts_path)
    print(f"MS done: abs_ms_mis rate = {ms_mis}")
    simplified_inferred_ts = inferred_ts.simplify()  # Remove unary nodes
    # Calculate mean num children (polytomy-measure) for internal nodes
    nc_sum = 0
    nc_sum_sq = 0
    nc_tot = 0
    root_lengths = collections.defaultdict(float)
    for tree in simplified_inferred_ts.trees():
        for n in tree.nodes():
            n_children = tree.num_children(n)
            if n_children > 0:  # exclude leaves/samples
                nc_sum += n_children * tree.span
                nc_sum_sq += (n_children**2) * tree.span
                nc_tot += tree.span
    nc_mean = nc_sum / nc_tot
    nc_var = nc_sum_sq / nc_tot - nc_mean**2  # can't be bothered to adjust for n

    # Calculate span of root nodes in simplified tree

    # Calculate KC
    try:
        kc = simplified_inferred_ts.kc_distance(tskit.load(prefix + ".trees"))
    except FileNotFoundError:
        kc = None
    return Results(abs_ma_mis=ma_mis,
                   abs_ms_mis=ms_mis,
                   rel_ma_mis=ma_mis_rate,
                   rel_ms_mis=ms_mis_rate,
                   cutoff_power=params.cutoff_power,
                   trim_oldest=params.trim_oldest,
                   precision=precision,
                   edges=inferred_ts.num_edges,
                   muts=inferred_ts.num_mutations,
                   num_trees=inferred_ts.num_trees,
                   kc=kc,
                   mean_node_children=nc_mean,
                   var_node_children=nc_var,
                   process_time=process_time,
                   ts_size=os.path.getsize(ts_path),
                   ts_path=ts_path)
Example #17
 def test_ancestors_file(self, small_sd_fixture):
     ancestor_data = tsinfer.generate_ancestors(small_sd_fixture)
     self.validate_file(ancestor_data)
Example #18
def setup_sampledata_from_simulation(prefix,
                                     random_seed,
                                     err=0,
                                     num_threads=1,
                                     cheat_breakpoints=False,
                                     use_sites_time=False,
                                     skip_existing=False):
    """
    Take the results of a simulation and return a sample data file, some reconstructed
    ancestors, a recombination rate array, a suffix to append to the file prefix, and
    the original tree sequence.
    
    If 'err' is 0, we do not inject any errors into the haplotypes. Otherwise
    we add empirical sequencing error and ancestral allele polarity error
    
    If "cheat_recombination" is True, multiply the recombination_rate for known
    recombination locations from the simulation by 20

    If "use_sites_time" is True, use the times
    
    If "skip_existing" is True, and the sample_data file and ancestors_file that were
    going to be generated already exist, then skip the actual simulation and just return
    those files and their data.
    """
    suffix = ""
    ts = tskit.load(prefix + ".trees")
    plain_samples = tsinfer.SampleData.from_tree_sequence(
        ts, use_sites_time=use_sites_time)
    if cheat_breakpoints:
        suffix += "cheat_breakpoints"
        logger.info("Cheating by using known breakpoints")
    if use_sites_time:
        suffix += "use_times"
        logger.info("Cheating by using known times")
    if err == 0:
        sd_path = prefix + suffix + ".samples"
        if skip_existing and os.path.exists(sd_path):
            logger.info(
                f"Simulation file {sd_path} already exists, loading that.")
            sd = tsinfer.load(sd_path)
        else:
            sd = plain_samples.copy(path=sd_path)  # Save the samples file
            sd.finalise()
    else:
        logger.info("Adding error")
        suffix += f"_ae{err}"
        sd_path = prefix + suffix + ".samples"
        if skip_existing and os.path.exists(sd_path):
            logger.info(f"Sample file {sd_path} already exists, loading that.")
            sd = tsinfer.load(sd_path)
        else:
            error_file = add_errors(plain_samples,
                                    err,
                                    random_seed=random_seed)
            sd = error_file.copy(path=prefix + suffix + ".samples")
            if use_sites_time:
                # Sites that were originally singletons have time 0, but could have been
                # converted to inference sites when adding error. Give these a nonzero time
                sites_time = sd.sites_time
                sites_time[sites_time == 0] = np.min(
                    sites_time[sites_time > 0]) / 1000.0
                sd.sites_time[:] = sites_time
            sd.finalise()
    for attribute in ('sequence_length', 'num_samples', 'num_sites'):
        if getattr(sd, attribute) != getattr(ts, attribute):
            raise ValueError(
                f"{attribute} differs between original ts and sample_data: "
                f"{getattr(sd, attribute)} vs {getattr(ts, attribute)}")

    anc_path = prefix + suffix + ".ancestors"
    if skip_existing and os.path.exists(anc_path):
        logger.info(f"Ancestors file {anc_path} already exists, loading that.")
        anc = tsinfer.load(anc_path)
    else:
        anc = tsinfer.generate_ancestors(
            sd,
            num_threads=num_threads,
            path=anc_path,
        )
        logger.info("GA done")

    inference_pos = anc.sites_position[:]

    rho = 1e-8  # shouldn't matter what this is - it is relative to the mismatch rate
    if cheat_breakpoints:
        raise NotImplementedError(
            "Need to make a RateMap with higher r at breakpoints")
        breakpoint_positions = np.array(list(ts.breakpoints()))
        inference_positions = anc.sites_position[:]
        breakpoints = np.searchsorted(inference_positions,
                                      breakpoint_positions)
        # Any after the last inference position must be junked
        # (those before the first inference position make no difference)
        breakpoints = breakpoints[breakpoints != len(rho)]
        rho[breakpoints] *= 20
    return sd.path, anc.path, rho, suffix, ts
Example #19
def run(params):
    """
    Run a single inference, with the specified rates
    """
    rho = params.rec_rate
    av_rho = np.quantile(rho, 0.5)
    ma_mis = av_rho * params.ma_mis_rate
    ms_mis = av_rho * params.ms_mis_rate

    if params.precision is None:
        # Smallest nonzero recombination rate
        min_rho = int(np.ceil(-np.min(np.log10(rho[rho > 0]))))
        # Smallest mean
        av_min = int(np.ceil(-np.log10(min(ma_mis, ms_mis))))
        precision = max(min_rho, av_min) + 3
    else:
        precision = params.precision

    print(
        f"Starting {params.ma_mis_rate} {params.ms_mis_rate}",
        f"with av rho {av_rho:.5g}",
        f"(mean {np.mean(rho):.4g}, median {np.quantile(rho, 0.5):.4g}, ",
        f"nonzero min {np.min(rho[rho > 0]):.4g}, ",
        f"2.5% quantile {np.quantile(rho, 0.025):.4g}) precision {precision}")
    prefix = inf_prefix = None
    if params.sample_data.path is not None:
        assert params.sample_data.path.endswith(".samples")
        prefix = params.sample_data.path[0:-len(".samples")]
        inf_prefix = "{}_ma{}_ms{}_N{}_p{}".format(prefix, params.ma_mis_rate,
                                                   params.ms_mis_rate,
                                                   params.cutoff_exponent,
                                                   precision)
    start_time = time.process_time()
    extra_params = dict(num_threads=params.num_threads)
    if params.cutoff_exponent is not None:
        extra_params['cutoff_power'] = params.cutoff_exponent
    anc = tsinfer.generate_ancestors(
        params.sample_data,
        path=None if inf_prefix is None else inf_prefix + ".ancestors",
        progress_monitor=tsinfer.cli.ProgressMonitor(1, 1, 0, 0, 0),
        **extra_params,
    )
    print(f"GA done (cutoff exponent: {params.cutoff_exponent}")
    extra_params = dict(
        num_threads=params.num_threads,
        recombination_rate=rho,
        precision=precision,
    )
    inferred_anc_ts = tsinfer.match_ancestors(
        params.sample_data,
        anc,
        mismatch_rate=ma_mis,
        progress_monitor=tsinfer.cli.ProgressMonitor(1, 0, 1, 0, 0),
        **extra_params,
    )
    inferred_anc_ts.dump(path=inf_prefix + ".atrees")
    print(f"MA done (ma_mis:{ma_mis}")
    inferred_ts = tsinfer.match_samples(
        params.sample_data,
        inferred_anc_ts,
        mismatch_rate=ms_mis,
        progress_monitor=tsinfer.cli.ProgressMonitor(1, 0, 0, 0, 1),
        **extra_params,
    )
    process_time = time.process_time() - start_time
    ts_path = inf_prefix + ".trees"
    inferred_ts.dump(path=ts_path)
    print(f"MS done: ms_mis rate = {ms_mis})")
    simplified_inferred_ts = inferred_ts.simplify()  # Remove unary nodes
    # Calculate mean num children (polytomy-measure) for internal nodes
    nc_sum = 0
    nc_sum_sq = 0
    nc_tot = 0
    root_lengths = collections.defaultdict(float)
    for tree in simplified_inferred_ts.trees():
        for n in tree.nodes():
            n_children = tree.num_children(n)
            if n_children > 0:  # exclude leaves/samples
                nc_sum += n_children * tree.span
                nc_sum_sq += (n_children**2) * tree.span
                nc_tot += tree.span
    nc_mean = nc_sum / nc_tot
    nc_var = nc_sum_sq / nc_tot - nc_mean**2  # can't be bothered to adjust for n

    # Calculate span of root nodes in simplified tree

    # Calculate KC
    try:
        kc = simplified_inferred_ts.kc_distance(tskit.load(prefix + ".trees"))
    except FileNotFoundError:
        kc = None
    return Results(abs_ma_mis=ma_mis,
                   abs_ms_mis=ms_mis,
                   rel_ma_mis=params.ma_mis_rate,
                   rel_ms_mis=params.ms_mis_rate,
                   precision=precision,
                   edges=inferred_ts.num_edges,
                   muts=inferred_ts.num_mutations,
                   num_trees=inferred_ts.num_trees,
                   kc=kc,
                   cutoff_exponent=params.cutoff_exponent,
                   mean_node_children=nc_mean,
                   var_node_children=nc_var,
                   process_time=process_time,
                   ts_size=os.path.getsize(ts_path),
                   ts_path=ts_path)
Example #20
 def test_ancestors_file(self):
     ts = msprime.simulate(10, mutation_rate=1, random_seed=1)
     self.assertGreater(ts.num_sites, 1)
     sample_data = tsinfer.SampleData.from_tree_sequence(ts)
     ancestor_data = tsinfer.generate_ancestors(sample_data)
     self.validate_file(ancestor_data)
Example #21
def small_sd_anc_fixture(small_ts_fixture):
    """
    A sample data instance and a generated ancestors instance from the small 1-tree sequence
    """
    sd = tsinfer.SampleData.from_tree_sequence(small_ts_fixture, use_sites_time=False)
    return sd, tsinfer.generate_ancestors(sd)
                        help="'C' or 'P' for the c engine or python engine")
    args = parser.parse_args()

    if args.engine == "C":
        engine = tsinfer.C_ENGINE
    elif args.engine == "P":
        engine = tsinfer.PY_ENGINE
    else:
        raise ValueError

    sd = tsinfer.load(args.input_file)

    anc = tsinfer.generate_ancestors(
        sd,
        path=args.output_file,
        engine=engine,
        num_threads=args.num_threads,
        progress_monitor=tsinfer.cli.ProgressMonitor(enabled=True,
                                                     generate_ancestors=True),
    )

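    # Count how many generated ancestors (excluding the first two entries)
    # cover every inference site, and at how many distinct times they occur.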
    full_len = np.logical_and(anc.ancestors_start[:][2:] == 0,
                              anc.ancestors_end[:][2:] == anc.num_sites)

    u, cnts = np.unique(anc.ancestors_time[:][2:][full_len],
                        return_counts=True)

    print("{}/{} full length ancestors at {} unique times ({} single)".format(
        np.sum(full_len),
        len(full_len),
        len(u),
        np.sum(cnts == 1)))