def _initializeIndividualTable(self, tc):
    """
    Returns node ID -> individual map
    """
    # First, alive individuals:
    individual_nodes = {}
    for i in range(self.N):
        individual_nodes[2 * i] = i
        individual_nodes[2 * i + 1] = i
    metadata_strings = _generate_individual_metadata(self.diploid_metadata, tc)
    # Now, preserved nodes
    num_ind_nodes = self.N
    for i in self.ancient_sample_metadata:
        assert i not in individual_nodes, "individual record error"
        individual_nodes[i.nodes[0]] = num_ind_nodes
        individual_nodes[i.nodes[1]] = num_ind_nodes
        num_ind_nodes += 1
    metadata_strings.extend(
        _generate_individual_metadata(self.ancient_sample_metadata, tc))
    md, mdo = tskit.pack_bytes(metadata_strings)
    flags = [0 for i in range(self.N + len(self.ancient_sample_metadata))]
    tc.individuals.set_columns(flags=flags, metadata=md, metadata_offset=mdo)
    return individual_nodes
def test_pickle(self):
    ts = msprime.simulate(10, random_seed=1)
    tables = ts.dump_tables()
    # For each node, we create some Python metadata that can be pickled
    metadata = [{
        "one": j,
        "two": 2 * j,
        "three": list(range(j))
    } for j in range(ts.num_nodes)]
    encoded, offset = tskit.pack_bytes(list(map(pickle.dumps, metadata)))
    tables.nodes.set_columns(
        flags=tables.nodes.flags,
        time=tables.nodes.time,
        population=tables.nodes.population,
        metadata_offset=offset,
        metadata=encoded,
    )
    self.assertTrue(np.array_equal(tables.nodes.metadata_offset, offset))
    self.assertTrue(np.array_equal(tables.nodes.metadata, encoded))
    ts1 = tables.tree_sequence()
    for j, node in enumerate(ts1.nodes()):
        decoded_metadata = pickle.loads(node.metadata)
        self.assertEqual(decoded_metadata, metadata[j])
    ts1.dump(self.temp_file)
    ts2 = tskit.load(self.temp_file)
    self.assertEqual(ts1.tables.nodes, ts2.tables.nodes)
def _initializePopulationTable(node_view, tc):
    population_metadata = []
    for i in sorted(np.unique(node_view['population'])):
        md = "deme" + str(i)
        population_metadata.append(md.encode("utf-8"))
    pmd, pmdo = tskit.pack_bytes(population_metadata)
    tc.populations.set_columns(metadata=pmd, metadata_offset=pmdo)
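# A minimal, self-contained sketch (assuming only the public tskit API, with
# illustrative byte strings) of the pack_bytes/unpack_bytes round trip these
# helpers rely on: pack_bytes flattens a list of byte strings into one int8
# array plus a per-row offsets array, and unpack_bytes reverses it.
import tskit

metadata = [b"deme0", b"deme1", b"deme2"]
packed, offsets = tskit.pack_bytes(metadata)
assert tskit.unpack_bytes(packed, offsets) == metadata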
def _upgrade_old_tables(tables):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        provenance = get_provenance(tables)
    file_version = provenance.file_version
    slim_generation = provenance.slim_generation
    warnings.warn(
        "This is a version {} SLiM tree sequence.".format(file_version) +
        " When you write this out, " +
        "it will be converted to version {}.".format(slim_file_version))
    if file_version == "0.1" or file_version == "0.2":
        # add empty nucleotide slots to metadata
        mut_bytes = tskit.unpack_bytes(tables.mutations.metadata,
                                       tables.mutations.metadata_offset)
        mut_metadata = [
            _decode_mutation_pre_nucleotides(md) for md in mut_bytes
        ]
        metadata, metadata_offset = tskit.pack_bytes(mut_metadata)
        tables.mutations.set_columns(
            site=tables.mutations.site,
            node=tables.mutations.node,
            parent=tables.mutations.parent,
            derived_state=tables.mutations.derived_state,
            derived_state_offset=tables.mutations.derived_state_offset,
            metadata=metadata,
            metadata_offset=metadata_offset)
    if file_version == "0.1":
        # shift times
        node_times = tables.nodes.time + slim_generation
        tables.nodes.set_columns(flags=tables.nodes.flags,
                                 time=node_times,
                                 population=tables.nodes.population,
                                 individual=tables.nodes.individual,
                                 metadata=tables.nodes.metadata,
                                 metadata_offset=tables.nodes.metadata_offset)
        migration_times = tables.migrations.time + slim_generation
        tables.migrations.set_columns(left=tables.migrations.left,
                                      right=tables.migrations.right,
                                      node=tables.migrations.node,
                                      source=tables.migrations.source,
                                      dest=tables.migrations.dest,
                                      time=migration_times)
    new_record = {
        "schema_version": "1.0.0",
        "software": {
            "name": "pyslim",
            "version": pyslim_version,
        },
        "parameters": {
            "command": ["_upgrade_old_tables"],
            "old_file_version": file_version,
            "new_file_version": slim_file_version,
        },
        "environment": get_environment(),
    }
    tskit.validate_provenance(new_record)
    tables.provenances.add_row(json.dumps(new_record))
def _set_sites_mutations(tables):
    '''
    Adds to a TableCollection the information relevant to mutations required
    for SLiM to load in a tree sequence. This means adding to the metadata
    column of the Mutation table. It will also

    - give SLiM IDs to each mutation
    - round Site positions to integer values
    - stack any mutations that end up at the same position as a result
    - replace ancestral states with ""

    This will replace any information already in the metadata or derived state
    columns of the Mutation table.
    '''
    num_mutations = tables.mutations.num_rows
    default_mut = default_slim_metadata("mutation")
    # derived states must be packed as bytes, so encode the SLiM IDs
    dsb, dso = tskit.pack_bytes([str(j).encode() for j in range(num_mutations)])
    slim_time = tables.metadata["SLiM"]["generation"] - tables.mutations.time
    mms = tables.mutations.metadata_schema
    mutation_metadata = [
        mms.encode_row({
            "mutation_list": [{
                "mutation_type": default_mut["mutation_type"],
                "selection_coeff": default_mut["selection_coeff"],
                "subpopulation": default_mut["subpopulation"],
                "slim_time": st,
                "nucleotide": default_mut["nucleotide"]
            }]
        }) for st in slim_time
    ]
    mdb, mdo = tskit.pack_bytes(mutation_metadata)
    tables.mutations.set_columns(site=tables.mutations.site,
                                 node=tables.mutations.node,
                                 time=tables.mutations.time,
                                 derived_state=dsb,
                                 derived_state_offset=dso,
                                 parent=tables.mutations.parent,
                                 metadata=mdb,
                                 metadata_offset=mdo)
    tables.sites.set_columns(position=tables.sites.position,
                             ancestral_state=np.array([], dtype='int8'),
                             ancestral_state_offset=np.zeros(
                                 tables.sites.num_rows + 1, dtype='uint32'))
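# A minimal sketch (not pyslim's actual schema; the codec and row contents are
# hypothetical) of the pattern used above: bulk-encode per-row metadata dicts
# with a tskit.MetadataSchema, then pack them into the ragged
# (metadata, metadata_offset) columns expected by set_columns().
import tskit

schema = tskit.MetadataSchema({"codec": "json"})        # illustrative permissive schema
rows = [{"slim_time": t} for t in (0, 1, 2)]            # illustrative per-mutation dicts
encoded = [schema.validate_and_encode_row(r) for r in rows]
md, md_offset = tskit.pack_bytes(encoded)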
def _generate_mutation_metadata(pop):
    muts = []
    for mr in pop.tables.mutations:
        m = pop.mutations[mr.key]
        d = {
            's': m.s,
            'h': m.h,
            # 'g': m.g,
            'label': m.label,
            'esizes': list(m.esizes),
            'heffects': list(m.heffects),
            'neutral': m.neutral
        }
        muts.append(str(d).encode('utf-8'))
    return tskit.pack_bytes(muts)
def _generate_mutation_metadata(self):
    muts = []
    for mr in self.tables.mutations:
        m = self.mutations[mr.key]
        d = {
            's': m.s,
            'h': m.h,
            'age': self.generation - m.g + 1,
            'label': m.label,
            'esizes': list(m.esizes),
            'heffects': list(m.heffects),
            'neutral': m.neutral,
            'key': mr.key
        }
        muts.append(str(d).encode('utf-8'))
    return tskit.pack_bytes(muts)
def posterior_mean_var(ts, timepoints, posterior, Ne, *, fixed_node_set=None):
    """
    Mean and variance of node age in scaled time. Fixed nodes will be given a
    mean of their exact time in the tree sequence, and zero variance (as long
    as they are identified by the fixed_node_set). If fixed_node_set is None,
    we attempt to date all the non-sample nodes.
    Also assigns the estimated mean and variance of the age of each node,
    in unscaled time, as metadata in the tree sequence.
    """
    mn_post = np.full(ts.num_nodes, np.nan)  # Fill with NaNs so we detect when
    vr_post = np.full(ts.num_nodes, np.nan)  # there's been an error

    tables = ts.dump_tables()
    if fixed_node_set is None:
        fixed_node_set = ts.samples()
    fixed_nodes = np.array(list(fixed_node_set))
    mn_post[fixed_nodes] = tables.nodes.time[fixed_nodes]
    vr_post[fixed_nodes] = 0

    metadata_array = tskit.unpack_bytes(ts.tables.nodes.metadata,
                                        ts.tables.nodes.metadata_offset)
    timepoints = timepoints * 2 * Ne
    for row, node_id in zip(posterior.grid_data, posterior.nonfixed_nodes):
        mn_post[node_id] = np.sum(row * timepoints) / np.sum(row)
        vr_post[node_id] = np.sum(
            ((mn_post[node_id] - timepoints) ** 2) * (row / np.sum(row)))
        metadata_array[node_id] = json.dumps({
            "mn": mn_post[node_id],
            "vr": vr_post[node_id]
        }).encode()
    md, md_offset = tskit.pack_bytes(metadata_array)
    tables.nodes.set_columns(
        flags=tables.nodes.flags,
        time=tables.nodes.time,
        population=tables.nodes.population,
        individual=tables.nodes.individual,
        metadata=md,
        metadata_offset=md_offset,
    )
    ts = tables.tree_sequence()
    return ts, mn_post, vr_post
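# Worked sketch of the per-node posterior summary computed in the loop above:
# `row` holds (unnormalised) posterior weights over the scaled `timepoints`
# grid, and the mean/variance are the weighted moments of that grid. All
# numeric values here are illustrative, not from any real dataset.
import numpy as np

timepoints = np.array([10.0, 20.0, 40.0])   # grid already multiplied by 2 * Ne
row = np.array([0.2, 0.5, 0.3])             # posterior weights for one node
weights = row / np.sum(row)
mean_age = np.sum(timepoints * weights)                   # = 24.0
var_age = np.sum((timepoints - mean_age) ** 2 * weights)  # = 124.0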
def combine_chromosome_arms(args):
    """
    Splices two chromosome arms together to form a full chromosome
    """
    short_arm = tskit.load(args.p_arm)
    long_arm = tskit.load(args.q_arm)

    assert short_arm.num_samples == long_arm.num_samples
    # Remove material before first position and after last position
    short_arm = short_arm.keep_intervals(
        [[
            short_arm.tables.sites.position[0] - 1,
            short_arm.tables.sites.position[-1] + 1,
        ]],
        simplify=False,
    )
    long_arm = long_arm.keep_intervals(
        [[
            long_arm.tables.sites.position[0] - 1,
            long_arm.tables.sites.position[-1] + 1,
        ]],
        simplify=False,
    )
    short_tables = short_arm.dump_tables()
    long_tables = long_arm.dump_tables()
    assert np.array_equal(short_tables.individuals.metadata,
                          long_tables.individuals.metadata)
    short_tables.sequence_length = long_arm.get_sequence_length()

    short_metadata = short_tables.nodes.metadata
    short_metadata_offset = short_tables.nodes.metadata_offset
    short_metadata = tskit.unpack_bytes(short_metadata, short_metadata_offset)
    long_metadata = long_tables.nodes.metadata
    long_metadata_offset = long_tables.nodes.metadata_offset
    long_metadata = tskit.unpack_bytes(long_metadata, long_metadata_offset)
    long_metadata = long_metadata[long_arm.num_samples:]
    combined_metadata = np.concatenate([short_metadata, long_metadata])
    metadata, metadata_offset = tskit.pack_bytes(combined_metadata)

    all_nodes_except_samples = ~np.isin(np.arange(long_arm.num_nodes),
                                        long_arm.samples())
    short_tables.nodes.append_columns(
        long_tables.nodes.flags[all_nodes_except_samples],
        long_tables.nodes.time[all_nodes_except_samples],
        long_tables.nodes.population[all_nodes_except_samples],
    )
    short_tables.nodes.set_columns(
        flags=short_tables.nodes.flags,
        time=short_tables.nodes.time,
        population=short_tables.nodes.population,
        metadata=metadata,
        individual=short_tables.nodes.individual,
        metadata_offset=metadata_offset,
    )

    long_edges_parent = long_tables.edges.parent
    long_edges_child = long_tables.edges.child
    long_arm_sample_map = np.zeros(long_arm.num_nodes).astype(int)
    long_arm_sample_map[long_arm.samples()] = short_arm.samples()
    long_edges_parent[~np.isin(long_edges_parent, long_arm.samples())] = (
        long_edges_parent[~np.isin(long_edges_parent, long_arm.samples())]
        + short_arm.num_nodes)
    long_edges_parent[long_arm.tables.edges.parent > long_arm.samples()[-1]] = (
        long_edges_parent[long_arm.tables.edges.parent > long_arm.samples()[-1]]
        - long_arm.num_samples)
    long_edges_child[~np.isin(long_edges_child, long_arm.samples())] = (
        long_edges_child[~np.isin(long_edges_child, long_arm.samples())]
        + short_arm.num_nodes)
    long_edges_child[long_tables.edges.child > long_arm.samples()[-1]] = (
        long_edges_child[long_tables.edges.child > long_arm.samples()[-1]]
        - long_arm.num_samples)
    long_edges_child[np.isin(long_tables.edges.child, long_arm.samples())] = (
        long_arm_sample_map[long_tables.edges.child[
            np.isin(long_tables.edges.child, long_arm.samples())]])
    short_tables.edges.append_columns(
        long_tables.edges.left,
        long_tables.edges.right,
        long_edges_parent,
        long_edges_child,
    )
    short_tables.sites.append_columns(
        long_tables.sites.position,
        long_tables.sites.ancestral_state,
        long_tables.sites.ancestral_state_offset,
    )
    long_mutations_node = long_tables.mutations.node
    long_mutations_node[~np.isin(long_mutations_node, long_arm.samples())] = (
        long_mutations_node[~np.isin(long_mutations_node, long_arm.samples())]
        + short_arm.num_nodes)
    long_mutations_node[long_tables.mutations.node > long_arm.samples()[-1]] = (
        long_mutations_node[long_tables.mutations.node > long_arm.samples()[-1]]
        - long_arm.num_samples)
    long_mutations_node[np.isin(long_tables.mutations.node, long_arm.samples())] = (
        long_arm_sample_map[long_tables.mutations.node[
            np.isin(long_tables.mutations.node, long_arm.samples())]])
    short_tables.mutations.append_columns(
        long_tables.mutations.site + short_arm.num_sites,
        long_mutations_node,
        long_tables.mutations.derived_state,
        long_tables.mutations.derived_state_offset,
    )
    short_tables.sort()
    combined = short_tables.tree_sequence()

    assert combined.num_nodes == (short_arm.num_nodes + long_arm.num_nodes
                                  - short_arm.num_samples)
    assert combined.num_sites == (short_arm.num_sites + long_arm.num_sites)
    assert combined.num_edges == (short_arm.num_edges + long_arm.num_edges)
    assert combined.num_mutations == (short_arm.num_mutations
                                      + long_arm.num_mutations)
    assert (combined.num_individuals == short_arm.num_individuals
            == long_arm.num_individuals)
    assert np.array_equal(
        np.sort(combined.tables.sites.position),
        np.concatenate(
            [short_arm.tables.sites.position, long_arm.tables.sites.position]),
    )
    assert np.array_equal(
        np.sort(combined.tables.nodes.time[combined.tables.mutations.node]),
        np.sort(
            np.concatenate([
                short_arm.tables.nodes.time[short_arm.tables.mutations.node],
                long_arm.tables.nodes.time[long_arm.tables.mutations.node],
            ])),
    )
    assert np.array_equal(combined.tables.individuals.metadata,
                          long_tables.individuals.metadata)
    combined.dump(args.output)
def _set_nodes_individuals(tables, age):
    '''
    Adds to a TableCollection the information relevant to individuals required
    for SLiM to load in a tree sequence, that is found in Node and Individual
    tables. This will replace any existing Individual table, and will replace
    any information already in the individual, metadata, and population columns
    of the Node table.

    This is designed to make it easy to assign default values:
    - (node_ind) the 2*j-th and (2*j+1)-st `sample` nodes to individual j
    - (location) individual locations to (0, 0, 0)
    - (age) individual age to 0
    - (ind_id) SLiM individual pedigree IDs to sequential integers starting from 0
    - (ind_population) individual populations to 0
    - (node_id) SLiM genome IDs to sequential integers starting with samples from 0
    - (node_is_null) genomes to be non-null
    - (node_type) genome type to 0 (= autosome)
    - (ind_flags) INDIVIDUAL_ALIVE

    If you have other situations, like non-alive "remembered" individuals, you
    will need to edit the tables by hand, afterwards.
    '''
    samples = np.where(tables.nodes.flags & tskit.NODE_IS_SAMPLE)[0]
    if (len(samples) % 2) != 0:
        raise ValueError("There must be an even number of sampled nodes, "
                         "since organisms are diploid.")
    num_individuals = int(len(samples) / 2)
    node_ind = np.repeat(tskit.NULL, tables.nodes.num_rows).astype("int32")
    node_ind[samples] = np.arange(len(samples)) // 2
    ind_id = np.arange(num_individuals)
    slim_node_id = np.repeat(tskit.NULL, tables.nodes.num_rows)
    slim_node_id[samples] = np.arange(len(samples))
    ind_population = np.repeat(tskit.NULL, num_individuals)
    ind_population[node_ind[samples]] = tables.nodes.population[samples]
    if not np.all(unique_labels_by_group(node_ind, tables.nodes.population)):
        raise ValueError("Individual has nodes from more than one population.")
    if not np.all(unique_labels_by_group(node_ind, tables.nodes.time)):
        raise ValueError("Individual has nodes from more than one time.")
    loc_vec = np.zeros(num_individuals * 3).astype("float64")
    loc_off = 3 * np.arange(num_individuals + 1).astype("uint32")
    ind_flags = np.repeat(INDIVIDUAL_ALIVE, num_individuals).astype("uint32")
    default_ind = default_slim_metadata("individual")
    sex = default_ind['sex']
    slim_flag = default_ind['flags']
    ims = tables.individuals.metadata_schema
    individual_metadata = [
        ims.encode_row({
            'pedigree_id': iid,
            'age': age,
            'subpopulation': int(pop),
            'sex': sex,
            'flags': slim_flag
        }) for (iid, pop) in zip(ind_id, ind_population)
    ]
    imb, imo = tskit.pack_bytes(individual_metadata)
    tables.individuals.set_columns(flags=ind_flags,
                                   location=loc_vec,
                                   location_offset=loc_off,
                                   metadata=imb,
                                   metadata_offset=imo)
    assert tables.individuals.num_rows == num_individuals
    default_node = default_slim_metadata("node")
    node_is_null = default_node["is_null"]
    node_type = default_node["genome_type"]
    nms = tables.nodes.metadata_schema
    node_metadata = [b'' for _ in range(tables.nodes.num_rows)]
    for j in samples:
        node_metadata[j] = nms.encode_row({
            'slim_id': slim_node_id[j],
            'is_null': node_is_null,
            'genome_type': node_type
        })
    nmb, nmo = tskit.pack_bytes(node_metadata)
    tables.nodes.set_columns(flags=tables.nodes.flags,
                             time=tables.nodes.time,
                             population=tables.nodes.population,
                             individual=node_ind,
                             metadata=nmb,
                             metadata_offset=nmo)
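# Sketch of the flattened location layout built above: each individual stores
# an (x, y, z) triple, so the column is one flat float64 vector with offsets
# advancing by 3 per row. The two individuals here are hypothetical.
import numpy as np

locations = [(0.0, 0.0, 0.0), (1.5, 2.0, 0.0)]
loc_vec = np.array(locations, dtype="float64").reshape(-1)   # [0, 0, 0, 1.5, 2, 0]
loc_off = 3 * np.arange(len(locations) + 1, dtype="uint32")  # [0, 3, 6]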
def _set_nodes_individuals(
        tables, node_ind=None, location=(0, 0, 0), age=0, ind_id=None,
        ind_population=None, ind_sex=INDIVIDUAL_TYPE_HERMAPHRODITE,
        ind_flags=INDIVIDUAL_ALIVE, slim_ind_flags=0, node_id=None,
        node_is_null=False, node_type=GENOME_TYPE_AUTOSOME):
    '''
    Adds to a TableCollection the information relevant to individuals required
    for SLiM to load in a tree sequence, that is found in Node and Individual
    tables. This will replace any existing Individual table, and will replace
    any information already in the individual, metadata, and population columns
    of the Node table.

    This is designed to make it easy to assign default values:
    - (node_ind) the 2*j-th and (2*j+1)-st `sample` nodes to individual j
    - (location) individual locations to (0, 0, 0)
    - (age) individual age to 0
    - (ind_id) SLiM individual pedigree IDs to sequential integers starting from 0
    - (ind_population) individual populations to 0
    - (node_id) SLiM genome IDs to sequential integers starting with samples from 0
    - (node_is_null) genomes to be non-null
    - (node_type) genome type to 0 (= autosome)
    - (ind_flags) INDIVIDUAL_ALIVE

    If you have other situations, like non-alive "remembered" individuals, you
    will need to edit the tables by hand, afterwards.
    '''
    samples = list(filter(lambda j: tables.nodes.flags[j] & tskit.NODE_IS_SAMPLE,
                          range(tables.nodes.num_rows)))
    if (len(samples) % 2) != 0:
        raise ValueError("There must be an even number of sampled nodes, "
                         "since organisms are diploid.")

    if node_ind is None:
        node_ind = [tskit.NULL for _ in range(tables.nodes.num_rows)]
        for j, k in enumerate(samples):
            node_ind[j] = int(k / 2)

    num_individuals = max(node_ind) + 1
    num_nodes = tables.nodes.num_rows

    if type(location) is tuple:
        location = [location for _ in range(num_individuals)]
    assert(len(location) == num_individuals)

    if type(age) is int or type(age) is float:
        age = [age for _ in range(num_individuals)]
    assert(len(age) == num_individuals)

    if ind_id is None:
        ind_id = list(range(num_individuals))
    assert(len(ind_id) == num_individuals)

    if type(ind_sex) is int:
        ind_sex = [ind_sex for _ in range(num_individuals)]
    assert(len(ind_sex) == num_individuals)

    if type(slim_ind_flags) is int:
        slim_ind_flags = [slim_ind_flags for _ in range(num_individuals)]
    assert(len(slim_ind_flags) == num_individuals)

    if type(ind_flags) is int:
        ind_flags = [ind_flags for _ in range(num_individuals)]
    assert(len(ind_flags) == num_individuals)

    if node_id is None:
        node_id = [-1 for _ in range(num_nodes)]
        for j, k in enumerate(list(samples)
                              + sorted(list(set(range(num_nodes)) - set(samples)))):
            node_id[k] = j
    assert(len(node_id) == num_nodes)

    if type(node_is_null) is bool:
        node_is_null = [node_is_null for _ in range(num_nodes)]
    assert(len(node_is_null) == num_nodes)

    if type(node_type) is int:
        node_type = [node_type for _ in range(num_nodes)]
    assert(len(node_type) == tables.nodes.num_rows)

    if ind_population is None:
        # set the individual populations based on what's in the nodes
        ind_population = [tskit.NULL for _ in range(num_individuals)]
        for j, u in enumerate(node_ind):
            if u >= 0:
                ind_population[u] = tables.nodes.population[j]
    assert(len(ind_population) == num_individuals)

    # check for consistency: every individual has two nodes, and populations agree
    ploidy = [0 for _ in range(num_individuals)]
    for j in samples:
        u = node_ind[j]
        assert(u >= 0)
        ploidy[u] += 1
        if tables.nodes.population[j] != ind_population[u]:
            raise ValueError("Inconsistent populations: nodes and individuals "
                             "do not agree.")

    if any([p != 2 for p in ploidy]):
        raise ValueError("Not all individuals have two assigned nodes.")

    tables.nodes.set_columns(flags=tables.nodes.flags,
                             time=tables.nodes.time,
                             population=tables.nodes.population,
                             individual=node_ind,
                             metadata=tables.nodes.metadata,
                             metadata_offset=tables.nodes.metadata_offset)

    loc_vec, loc_off = tskit.pack_bytes(location)
    tables.individuals.set_columns(
        flags=ind_flags, location=loc_vec, location_offset=loc_off)

    individual_metadata = [
        IndividualMetadata(*x)
        for x in zip(ind_id, age, ind_population, ind_sex, slim_ind_flags)]
    node_metadata = [None for _ in range(num_nodes)]
    for j in samples:
        node_metadata[j] = NodeMetadata(slim_id=node_id[j],
                                        is_null=node_is_null[j],
                                        genome_type=node_type[j])

    annotate_individual_metadata(tables, individual_metadata)
    annotate_node_metadata(tables, node_metadata)