Exemple #1
0
 def test_json(self):
     # Attach JSON-encoded metadata to every node row, then check that it
     # survives the node table, a tree sequence built from the tables, and
     # a dump/load cycle through self.temp_file.
     ts = msprime.simulate(10, random_seed=1)
     tables = ts.dump_tables()
     node_table = tables.nodes
     # One JSON-serialisable dict per row of the node table.
     metadata = []
     for j in range(len(node_table)):
         metadata.append({
             "one": j,
             "two": 2 * j,
             "three": list(range(j)),
         })
     json_strings = map(json.dumps, metadata)
     encoded, offset = msprime.pack_strings(json_strings)
     node_table.set_columns(
         flags=node_table.flags,
         time=node_table.time,
         population=node_table.population,
         metadata_offset=offset,
         metadata=encoded)
     # The table must report back exactly the columns it was given.
     self.assertTrue(np.array_equal(node_table.metadata_offset, offset))
     self.assertTrue(np.array_equal(node_table.metadata, encoded))
     ts1 = msprime.load_tables(nodes=node_table, edges=tables.edges)
     for index, node in enumerate(ts1.nodes()):
         raw_text = node.metadata.decode()
         decoded_metadata = json.loads(raw_text)
         self.assertEqual(decoded_metadata, metadata[index])
     # Writing to disk and reading back must leave the node table unchanged.
     ts1.dump(self.temp_file)
     ts2 = msprime.load(self.temp_file)
     self.assertEqual(ts1.tables.nodes, ts2.tables.nodes)
Exemple #2
0
 def test_simple_case(self):
     # Two five-character strings should pack into a ten-element array
     # with per-string lengths [5, 5], and unpack to the original list.
     words = ["hello", "world"]
     packed, lengths = msprime.pack_strings(words)
     self.assertEqual(list(lengths), [5, 5])
     self.assertEqual(packed.shape, (10, ))
     round_tripped = msprime.unpack_strings(packed, lengths)
     self.assertEqual(round_tripped, words)
Exemple #3
0
 def verify_packing(self, strings):
     """
     Pack the specified strings and check the result.

     Asserts the dtypes of the two returned arrays, that the length array
     holds len(s) for each input string, that the packed array has as many
     entries as the lengths sum to, and that unpacking gives back the
     input.
     """
     packed, lengths = msprime.pack_strings(strings)
     self.assertEqual(packed.dtype, np.int8)
     self.assertEqual(lengths.dtype, np.uint32)
     self.assertEqual(list(lengths), [len(s) for s in strings])
     total_length = np.sum(lengths)
     self.assertEqual(packed.shape[0], total_length)
     round_tripped = msprime.unpack_strings(packed, lengths)
     self.assertEqual(strings, round_tripped)
Exemple #4
0
def node_metadata_example():
    """
    Return a simulated tree sequence whose nodes carry string metadata.

    The nodes and edges of a fixed-seed simulation are dumped into tables,
    and a fresh node table is filled with the original flags and times plus
    the packed labels "n_0", "n_1", ... (one per node). The tree sequence is
    then loaded from the new nodes and the original edges.
    """
    ts = msprime.simulate(
        sample_size=100,
        recombination_rate=0.1,
        length=10,
        random_seed=1)
    nodes = msprime.NodeTable()
    edges = msprime.EdgeTable()
    ts.dump_tables(nodes=nodes, edges=edges)
    labelled_nodes = msprime.NodeTable()
    labels = []
    for u in range(ts.num_nodes):
        labels.append("n_{}".format(u))
    packed, offset = msprime.pack_strings(labels)
    labelled_nodes.set_columns(
        flags=nodes.flags,
        time=nodes.time,
        metadata=packed,
        metadata_offset=offset)
    return msprime.load_tables(nodes=labelled_nodes, edges=edges)
Exemple #5
0
 def test_optional_population(self):
     # When set_columns is called without a population column, every row's
     # population should read back as -1 and the other columns as given.
     for num_rows in (0, 10, 100):
         names = list(map(str, range(num_rows)))
         name, name_length = msprime.pack_strings(names)
         flags = list(range(num_rows))
         time = list(range(num_rows))
         table = msprime.NodeTable()
         table.set_columns(
             flags=flags,
             time=time,
             name=name,
             name_length=name_length)
         expected_population = [-1] * num_rows
         self.assertEqual(list(table.population), expected_population)
         self.assertEqual(list(table.flags), flags)
         self.assertEqual(list(table.time), time)
         self.assertEqual(list(table.name), list(name))
         self.assertEqual(list(table.name_length), list(name_length))
Exemple #6
0
 def test_random_names(self):
     # Store packed random names in a node table and check that each column
     # reads back as given and that the names unpack to the originals.
     for num_rows in (0, 10, 100):
         names = [random_string(10) for _ in range(num_rows)]
         name, name_length = msprime.pack_strings(names)
         flags = list(range(num_rows))
         time = list(range(num_rows))
         table = msprime.NodeTable()
         table.set_columns(
             flags=flags,
             time=time,
             name=name,
             name_length=name_length)
         self.assertEqual(list(table.flags), flags)
         self.assertEqual(list(table.time), time)
         self.assertEqual(list(table.name), list(name))
         self.assertEqual(list(table.name_length), list(name_length))
         stored_name = table.name
         stored_name_length = table.name_length
         unpacked_names = msprime.unpack_strings(
             stored_name, stored_name_length)
         self.assertEqual(names, unpacked_names)
Exemple #7
0
def node_name_example():
    """
    Return a simulated tree sequence whose nodes carry packed names.

    The nodes and edgesets of a fixed-seed simulation are dumped into
    tables, and a fresh node table is filled with the original flags and
    times plus the packed names "n_0", "n_1", ... (one per node). The tree
    sequence is loaded from the new nodes, the original edgesets and a
    single provenance string.
    """
    ts = msprime.simulate(
        sample_size=100, recombination_rate=0.1, length=10, random_seed=1)
    nodes = msprime.NodeTable()
    edgesets = msprime.EdgesetTable()
    ts.dump_tables(nodes=nodes, edgesets=edgesets)
    named_nodes = msprime.NodeTable()
    names = []
    for u in range(ts.num_nodes):
        names.append("n_{}".format(u))
    packed, length = msprime.pack_strings(names)
    named_nodes.set_columns(
        flags=nodes.flags,
        time=nodes.time,
        name=packed,
        name_length=length)
    return msprime.load_tables(
        nodes=named_nodes, edgesets=edgesets, provenance_strings=[b"sdf"])
Exemple #8
0
    def finalise(self):
        """
        Write the buffered site information to storage and finish up.

        Per-site position, frequency and packed ancestral/derived state
        arrays go into a "sites" group; the buffered singleton and
        invariant records go into "singletons" and "invariants" groups; the
        indexes of sites whose frequency lies strictly between 1 and the
        number of samples go into the variants group. The site counts are
        recorded as attributes, the filled part of the genotypes buffer is
        appended, the buffers are cleared and the parent class's finalise
        is called.

        Raises ValueError if there is no genotypes buffer (read-mode) or if
        no sites have been buffered.
        """
        if self.genotypes_buffer is None:
            raise ValueError("Cannot call finalise in read-mode")
        variant_site_ids = []
        sample_count = self.num_samples
        num_sites = len(self.site_buffer)
        if num_sites == 0:
            raise ValueError("Must have at least one site")

        def store(group, name, values, chunk_length):
            # All arrays written here use the same compressor and a
            # one-element chunks tuple.
            group.array(
                name,
                data=values,
                chunks=(chunk_length, ),
                compressor=self.compressor)

        position = np.empty(num_sites)
        frequency = np.empty(num_sites, dtype=np.uint32)
        ancestral_states = []
        derived_states = []
        for index, site in enumerate(self.site_buffer):
            site_frequency = site.frequency
            position[index] = site.position
            frequency[index] = site_frequency
            if 1 < site_frequency < sample_count:
                variant_site_ids.append(index)
            alleles = site.alleles
            ancestral_states.append(alleles[0])
            # Sites with fewer than two alleles get an empty derived state.
            derived_states.append(alleles[1] if len(alleles) >= 2 else "")

        sites_group = self.data.create_group("sites")
        store(sites_group, "position", position, num_sites)
        store(sites_group, "frequency", frequency, num_sites)

        # The packed state arrays are chunked by num_sites and their offset
        # arrays by num_sites + 1.
        packed_ancestral, ancestral_offset = msprime.pack_strings(
            ancestral_states)
        store(sites_group, "ancestral_state", packed_ancestral, num_sites)
        store(
            sites_group, "ancestral_state_offset", ancestral_offset,
            num_sites + 1)
        packed_derived, derived_offset = msprime.pack_strings(derived_states)
        store(sites_group, "derived_state", packed_derived, num_sites)
        store(
            sites_group, "derived_state_offset", derived_offset,
            num_sites + 1)

        # Each buffered singleton is a (site, sample) pair.
        num_singletons = len(self.singletons_buffer)
        singleton_sites = np.array(
            [pair[0] for pair in map(tuple, self.singletons_buffer)],
            dtype=np.int32) if False else np.array(
            [site for site, _ in self.singletons_buffer], dtype=np.int32)
        singleton_samples = np.array(
            [sample for _, sample in self.singletons_buffer], dtype=np.int32)
        singletons_group = self.data.create_group("singletons")
        # The chunk length is clamped to at least 1, including when the
        # buffer is empty.
        singleton_chunk = max(num_singletons, 1)
        store(singletons_group, "site", singleton_sites, singleton_chunk)
        store(singletons_group, "sample", singleton_samples, singleton_chunk)

        num_invariants = len(self.invariants_buffer)
        invariant_sites = np.array(self.invariants_buffer, dtype=np.int32)
        invariants_group = self.data.create_group("invariants")
        store(
            invariants_group, "site", invariant_sites,
            max(num_invariants, 1))

        num_variant_sites = len(variant_site_ids)
        attrs_to_record = (
            ("num_sites", num_sites),
            ("num_variant_sites", num_variant_sites),
            ("num_singleton_sites", num_singletons),
            ("num_invariant_sites", num_invariants),
        )
        for key, count in attrs_to_record:
            self.data.attrs[key] = count

        self.variants_group.create_dataset(
            "site",
            shape=(num_variant_sites, ),
            chunks=(max(num_variant_sites, 1), ),
            dtype=np.int32,
            data=variant_site_ids,
            compressor=self.compressor)

        # Only the filled prefix of the genotypes buffer is appended.
        filled = self.genotypes_buffer[:self.genotypes_buffer_offset]
        self.genotypes.append(filled)
        self.site_buffer = None
        self.genotypes_buffer = None
        super(SampleData, self).finalise()