Example #1
0
    def test_init(self):
        """Check that AncestorBuilder rejects missing or invalid arguments."""
        # Calling the constructor with no arguments at all must fail.
        self.assertRaises(TypeError, _tsinfer.AncestorBuilder)
        for bad_value in [None, "serf", [[], []], ["asdf"], {}]:
            # Each call gets its own context manager: anything placed after a
            # raising call inside the same ``with`` block is never executed,
            # so a shared block would leave the second argument untested.
            with self.assertRaises(TypeError):
                _tsinfer.AncestorBuilder(num_samples=2, max_sites=bad_value)
            with self.assertRaises(TypeError):
                _tsinfer.AncestorBuilder(num_samples=bad_value, max_sites=2)

        # Sample counts of 0 and 1 are expected to be rejected by the library
        # itself rather than by argument parsing.
        for bad_num_samples in [0, 1]:
            with self.assertRaises(_tsinfer.LibraryError):
                _tsinfer.AncestorBuilder(num_samples=bad_num_samples,
                                         max_sites=0)
Example #2
0
 def test_add_too_many_sites(self):
     """Adding sites beyond ``max_sites`` must raise ``LibraryError``."""
     for capacity in range(10):
         builder = _tsinfer.AncestorBuilder(num_samples=2, max_sites=capacity)
         # Fill the builder up to the limit passed as ``max_sites``.
         for _ in range(capacity):
             builder.add_site(time=1, genotypes=[0, 1])
         # Every further attempt is expected to be rejected.
         overflow_attempts = 2 * capacity
         for _ in range(overflow_attempts):
             with pytest.raises(_tsinfer.LibraryError):
                 builder.add_site(time=1, genotypes=[0, 1])
Example #3
0
 def test_add_site(self):
     """``add_site`` must reject badly typed times and malformed genotypes."""
     builder = _tsinfer.AncestorBuilder(num_samples=2, max_sites=10)

     # Non-numeric ``time`` values are expected to raise TypeError.
     invalid_times = ["sdf", {}, None]
     for invalid_time in invalid_times:
         with pytest.raises(TypeError):
             builder.add_site(time=invalid_time, genotypes=[0, 0])

     # Genotype inputs that are expected to raise ValueError for a builder
     # created with two samples.
     invalid_genotypes = ["asdf", [[], []], [0, 1, 2]]
     for genotypes in invalid_genotypes:
         with pytest.raises(ValueError):
             builder.add_site(time=0, genotypes=genotypes)
Example #4
0
 def test_add_too_many_sites(self):
     """Overfilling the builder raises ``LibraryError`` with a fixed message."""
     expected = "Cannot add more sites than the specified maximum."
     for capacity in range(10):
         builder = _tsinfer.AncestorBuilder(num_samples=2, max_sites=capacity)
         # Fill the builder up to the limit passed as ``max_sites``.
         for _ in range(capacity):
             builder.add_site(time=1, genotypes=[0, 1])
         # Each extra site must fail, and the error text must match exactly.
         extra_attempts = 2 * capacity
         for _ in range(extra_attempts):
             with pytest.raises(_tsinfer.LibraryError) as excinfo:
                 builder.add_site(time=1, genotypes=[0, 1])
             assert str(excinfo.value) == expected
Example #5
0
def build_ancestors(input_data, ancestor_data, progress=False, method="C"):
    """
    Build ancestors from ``input_data`` and append them to ``ancestor_data``.

    Every variant yielded by ``input_data.variants()`` is added to an
    ancestor builder. Each descriptor the builder then reports is turned into
    an ancestor and passed to ``ancestor_data.add_ancestor``. When there is at
    least one descriptor, two all-zero ancestors spanning every site are
    written first.

    :param input_data: Object providing ``num_variant_sites``,
        ``num_samples``, ``frequency`` and ``variants()``.
    :param ancestor_data: Destination object; only its ``add_ancestor``
        method is used here.
    :param bool progress: If True, tqdm progress bars are shown; otherwise
        they are created disabled.
    :param str method: ``"C"`` selects ``_tsinfer.AncestorBuilder``; any
        other value selects ``algorithm.AncestorBuilder``.
    """
    num_sites = input_data.num_variant_sites
    num_samples = input_data.num_samples
    if method == "C":
        logger.debug("Using C AncestorBuilder implementation")
        ancestor_builder = _tsinfer.AncestorBuilder(num_samples, num_sites)
    else:
        logger.debug("Using Python AncestorBuilder implementation")
        ancestor_builder = algorithm.AncestorBuilder(num_samples, num_sites)

    frequency = input_data.frequency[:]
    logger.info("Starting site addition")
    # The context manager closes the bar even if adding a site raises.
    with tqdm.tqdm(total=num_sites, disable=not progress) as progress_monitor:
        for j, (site_id, genotypes) in enumerate(input_data.variants()):
            ancestor_builder.add_site(j, int(frequency[site_id]), genotypes)
            progress_monitor.update()
    logger.info("Finished adding sites")

    descriptors = ancestor_builder.ancestor_descriptors()
    num_ancestors = len(descriptors)
    if num_ancestors > 0:
        logger.info("Starting build for {} ancestors".format(num_ancestors))
        # Single haplotype buffer reused for every ancestor written below.
        a = np.zeros(num_sites, dtype=np.uint8)
        # NOTE(review): this takes the first descriptor's time as the
        # reference, which presumably assumes descriptors are ordered
        # oldest-first -- confirm against ancestor_descriptors().
        root_time = descriptors[0][0] + 1
        ultimate_ancestor_time = root_time + 1
        # Add the ultimate ancestor. This is an awkward hack really; we don't
        # ever insert this ancestor. The only reason to add it here is that
        # it makes sure that the ancestor IDs we have in the ancestor file are
        # the same as in the ancestor tree sequence. This seems worthwhile.
        ancestor_data.add_ancestor(start=0,
                                   end=num_sites,
                                   time=ultimate_ancestor_time,
                                   focal_sites=[],
                                   haplotype=a)
        # Hack to ensure we always have a root with zeros at every position.
        ancestor_data.add_ancestor(start=0,
                                   end=num_sites,
                                   time=root_time,
                                   focal_sites=np.array([], dtype=np.int32),
                                   haplotype=a)
        # As above, the bar is closed even if building an ancestor fails.
        with tqdm.tqdm(
                total=num_ancestors, disable=not progress) as progress_monitor:
            for freq, focal_sites in descriptors:
                before = time.perf_counter()
                # TODO: This is a read-only process so we can multithread it.
                # make_ancestor receives the shared buffer ``a``; the checks
                # below require every position inside [s, e) to be known and
                # every position outside it to be UNKNOWN_ALLELE.
                s, e = ancestor_builder.make_ancestor(focal_sites, a)
                assert np.all(a[s:e] != UNKNOWN_ALLELE)
                assert np.all(a[:s] == UNKNOWN_ALLELE)
                assert np.all(a[e:] == UNKNOWN_ALLELE)
                duration = time.perf_counter() - before
                logger.debug(
                    "Made ancestor with {} focal sites and length={} in {:.2f}s.".
                    format(focal_sites.shape[0], e - s, duration))
                ancestor_data.add_ancestor(start=s,
                                           end=e,
                                           time=freq,
                                           focal_sites=focal_sites,
                                           haplotype=a)
                progress_monitor.update()
    logger.info("Finished building ancestors")