def test_init(self):
    """Check that ``_tsinfer.AncestorBuilder`` rejects bad constructor arguments.

    Covers three cases: construction with no arguments, construction with a
    non-integer value for either ``num_samples`` or ``max_sites`` (expects
    ``TypeError``), and construction with ``num_samples`` of 0 or 1 (expects
    ``_tsinfer.LibraryError``).
    """
    # Constructing with no arguments at all must fail.
    self.assertRaises(TypeError, _tsinfer.AncestorBuilder)
    for bad_value in [None, "serf", [[], []], ["asdf"], {}]:
        # Each call gets its own context manager. If both calls shared one
        # ``with`` block, the first would raise and leave the block, so the
        # second would never be executed (or checked).
        with self.assertRaises(TypeError):
            _tsinfer.AncestorBuilder(num_samples=2, max_sites=bad_value)
        with self.assertRaises(TypeError):
            _tsinfer.AncestorBuilder(num_samples=bad_value, max_sites=2)
    for bad_num_samples in [0, 1]:
        with self.assertRaises(_tsinfer.LibraryError):
            _tsinfer.AncestorBuilder(num_samples=bad_num_samples, max_sites=0)
def test_add_too_many_sites(self):
    """Adding sites beyond ``max_sites`` must raise ``_tsinfer.LibraryError``.

    For every capacity from 0 to 9 the builder is first filled to exactly
    that many sites, then ``2 * capacity`` further additions are attempted
    and each one is expected to fail.
    """
    # NOTE(review): another test with this same name is defined later in this
    # view; if both live in the same class the later one shadows this one.
    for capacity in range(10):
        builder = _tsinfer.AncestorBuilder(num_samples=2, max_sites=capacity)

        # Fill the builder right up to its declared capacity.
        remaining = capacity
        while remaining > 0:
            builder.add_site(time=1, genotypes=[0, 1])
            remaining -= 1

        # Every further addition is over the limit and must be rejected.
        overflow_attempts = 2 * capacity
        while overflow_attempts > 0:
            with pytest.raises(_tsinfer.LibraryError):
                builder.add_site(time=1, genotypes=[0, 1])
            overflow_attempts -= 1
def test_add_site(self):
    """``add_site`` must reject invalid ``time`` and ``genotypes`` arguments.

    A non-numeric ``time`` is expected to raise ``TypeError``; a malformed
    ``genotypes`` value is expected to raise ``ValueError``.
    """
    builder = _tsinfer.AncestorBuilder(num_samples=2, max_sites=10)

    invalid_times = ("sdf", {}, None)
    for invalid_time in invalid_times:
        with pytest.raises(TypeError):
            builder.add_site(time=invalid_time, genotypes=[0, 0])

    # The last entry has three genotypes while the builder was created with
    # num_samples=2.
    invalid_genotypes = ("asdf", [[], []], [0, 1, 2])
    for invalid in invalid_genotypes:
        with pytest.raises(ValueError):
            builder.add_site(time=0, genotypes=invalid)
def test_add_too_many_sites(self):
    """Exceeding ``max_sites`` must raise ``LibraryError`` with a fixed message.

    For every capacity from 0 to 9 the builder is filled to exactly that
    many sites; each of the following ``2 * capacity`` additions must fail
    and report the expected error text.
    """
    expected_message = "Cannot add more sites than the specified maximum."

    def add_one(target):
        # Single place that describes the site added throughout this test.
        target.add_site(time=1, genotypes=[0, 1])

    for capacity in range(10):
        builder = _tsinfer.AncestorBuilder(num_samples=2, max_sites=capacity)

        # Use up the whole allowance first.
        for _ in range(capacity):
            add_one(builder)

        # Anything beyond the allowance must be refused with the exact text.
        for _ in range(2 * capacity):
            with pytest.raises(_tsinfer.LibraryError) as record:
                add_one(builder)
            assert str(record.value) == expected_message
def build_ancestors(input_data, ancestor_data, progress=False, method="C"):
    """
    Build ancestors from ``input_data`` and store them in ``ancestor_data``.

    Every variant site of ``input_data`` is added to an ancestor builder,
    the builder's ancestor descriptors are retrieved, and one ancestor per
    descriptor is generated and written with ``ancestor_data.add_ancestor``.
    Two all-zero ancestors (an "ultimate" ancestor and a root) are written
    before the generated ones.

    :param input_data: Source of ``num_variant_sites``, ``num_samples``,
        ``frequency`` and ``variants()``.
    :param ancestor_data: Destination; receives ``add_ancestor`` calls.
    :param bool progress: If True, display tqdm progress bars.
    :param str method: ``"C"`` selects ``_tsinfer.AncestorBuilder``; any other
        value selects the Python ``algorithm.AncestorBuilder``.
    """
    num_sites = input_data.num_variant_sites
    num_samples = input_data.num_samples
    if method == "C":
        logger.debug("Using C AncestorBuilder implementation")
        ancestor_builder = _tsinfer.AncestorBuilder(num_samples, num_sites)
    else:
        logger.debug("Using Python AncestorBuilder implementation")
        ancestor_builder = algorithm.AncestorBuilder(num_samples, num_sites)

    # The ``with`` block guarantees the progress bar is closed even if adding
    # a site raises.
    with tqdm.tqdm(total=num_sites, disable=not progress) as progress_monitor:
        frequency = input_data.frequency[:]
        logger.info("Starting site addition")
        for j, (site_id, genotypes) in enumerate(input_data.variants()):
            ancestor_builder.add_site(j, int(frequency[site_id]), genotypes)
            progress_monitor.update()
    logger.info("Finished adding sites")

    descriptors = ancestor_builder.ancestor_descriptors()
    if len(descriptors) > 0:
        num_ancestors = len(descriptors)
        logger.info("Starting build for {} ancestors".format(num_ancestors))
        # Single haplotype buffer, reused for every ancestor written below.
        a = np.zeros(num_sites, dtype=np.uint8)
        # NOTE(review): presumably the first descriptor carries the largest
        # time value, so the root ends up older than every built ancestor;
        # confirm against ancestor_descriptors().
        root_time = descriptors[0][0] + 1
        ultimate_ancestor_time = root_time + 1
        # Add the ultimate ancestor. This is an awkward hack really; we don't
        # ever insert this ancestor. The only reason to add it here is that
        # it makes sure that the ancestor IDs we have in the ancestor file are
        # the same as in the ancestor tree sequence. This seems worthwhile.
        ancestor_data.add_ancestor(
            start=0, end=num_sites, time=ultimate_ancestor_time,
            focal_sites=[], haplotype=a)
        # Hack to ensure we always have a root with zeros at every position.
        ancestor_data.add_ancestor(
            start=0, end=num_sites, time=root_time,
            focal_sites=np.array([], dtype=np.int32), haplotype=a)
        # As above: the context manager closes the bar on any exit path,
        # including a failed assertion or an error from the builder.
        with tqdm.tqdm(
                total=num_ancestors, disable=not progress) as progress_monitor:
            for freq, focal_sites in descriptors:
                before = time.perf_counter()
                # TODO: This is a read-only process so we can multithread it.
                s, e = ancestor_builder.make_ancestor(focal_sites, a)
                # Sanity checks: the ancestor is fully defined on [s, e) and
                # marked unknown everywhere outside that interval.
                assert np.all(a[s:e] != UNKNOWN_ALLELE)
                assert np.all(a[:s] == UNKNOWN_ALLELE)
                assert np.all(a[e:] == UNKNOWN_ALLELE)
                duration = time.perf_counter() - before
                logger.debug(
                    "Made ancestor with {} focal sites and length={} in {:.2f}s.".
                    format(focal_sites.shape[0], e - s, duration))
                ancestor_data.add_ancestor(
                    start=s, end=e, time=freq, focal_sites=focal_sites,
                    haplotype=a)
                progress_monitor.update()
    logger.info("Finished building ancestors")