def __init__(self, data, snp, sample_id):
    '''Build a genotype set out of raw data arrays.

    - data: genotype data array. Its first axis is read as the SNP axis and its second
      axis as the sample axis (see the shape look-ups below).
    - snp: SNP metadata record array; its 'base_pair' field is indexed here.
    - sample_id: genotyped individuals' ID set.
    '''
    # Raw inputs: people's IDs and the genotype array
    self.sample_id = sample_id
    self.data = data

    # Problem dimensions, taken from the first two axes of the data array
    shape = self.data.shape
    self._num_snps = shape[0]
    self._num_samples = shape[1]
    self._snp_range = None

    # SNP metadata record array
    self.snp = snp

    # Base-pair locations and the inverse (base-pair location -> SNP index) map
    bp = self.snp['base_pair']
    self._base_pair = bp
    index_to_bp = dict(enumerate(self._base_pair))
    self._bp_to_snp = dict_invert(index_to_bp)

    # Search structures over the base-pair locations, for fast bp queries
    insertion_order = optimal_insertion_order(self._num_snps)
    self._snp_tree = BinarySearchTree(values=self._base_pair[insertion_order])
    self._snp_index_tree = util.list_index_tree(self._base_pair)

    # Genetic map (allele letters per SNP) and general metadata; both start out empty
    self.map = []
    self.metadata = []

    # Per-sample parent-of-origin phase flags, initialized to zero
    self.poo_phase = np.zeros((self._num_samples,), dtype=np.byte)
def sub_pedigree(self, samples):
    '''Return a sub-pedigree restricted to the specified sample array.

    The samples are re-ordered so that those below self.num_genotyped come first, and are
    then relabelled 0..len(samples)-1 in that order. Edges of the induced sub-graph are
    copied with their 'type' attribute.
    '''
    if isinstance(samples, list):
        samples = np.array(samples)
    num_samples = len(samples)

    # Positions of genotyped samples, followed by positions of all the others
    genotyped_pos = np.where(samples < self.num_genotyped)[0]
    other_pos = np.where(samples >= self.num_genotyped)[0]
    num_genotyped = len(genotyped_pos)
    samples = samples[np.concatenate((genotyped_pos, other_pos))]

    # Create family pedigree graph on the new labels.
    # Original family IDs are self.sample_id[i] if ever needed
    nodes = range(0, num_samples)
    recoding = dict(zip(samples, nodes))
    induced = self.graph.subgraph(samples)
    sub_graph = nx.DiGraph()
    sub_graph.add_nodes_from(nodes)
    for parent_type in im.constants.ALLELES:
        typed_edges = [(recoding[u], recoding[v])
                       for u, v in induced.edges_iter()
                       if induced.edge[u][v]['type'] == parent_type]
        sub_graph.add_edges_from(typed_edges, type=parent_type)

    # New label -> original sample index
    family_index = util.dict_invert(recoding)
    sample_index = [family_index[node] for node in nodes]

    def gather(values):
        '''Collect the entries of values at the original sample indices, in new-label order.'''
        return np.array([values[original] for original in sample_index])

    sample_id = gather(self.sample_id)
    sex = gather(self.sex)
    phenotype = gather(self.phenotype)
    node_type = gather(self.node_type)
    return Pedigree(sub_graph, sample_id=sample_id, sex=sex, phenotype=phenotype,
                    node_type=node_type, sample_index=sample_index,
                    num_genotyped=num_genotyped)
def sub_problem(self, samples, snps=None):
    '''Return a sub-problem that contains a subset 'samples' of the genotyped nodes.

    - samples: list or array of sample indices to keep.
    - snps: optional SNP indices to restrict the data to; None keeps all SNPs.

    Returns a new Problem built from copies of the restricted genotype, haplotype, QC and
    error arrays, the IBD segments whose samples all lie in 'samples', and (when snps is
    given) the frames re-expressed in the new SNP indices.
    '''
    if isinstance(samples, list):
        samples = np.array(samples)

    # Create pedigree object
    p = self.sub_pedigree(samples)

    # Re-order samples so that all genotyped appear before all non-genotyped
    genotyped = np.where(samples < self.pedigree.num_genotyped)[0]
    num_genotyped = len(genotyped)
    samples = samples[np.concatenate((genotyped, np.where(samples >= self.pedigree.num_genotyped)[0]))]

    # Relevant parts of the data arrays (copied further below)
    g, h = self.data
    g_snp, h_snp, qc = self.genotype.snp, self.haplotype.snp, self.haplotype.qc
    frames = self.frames
    if snps is not None:
        g, h = g[snps, :, :], h[snps, :, :]
        # An empty QC array is left untouched (not replaced by None) so that the
        # qc.size test further down remains valid
        if qc.size:
            qc = qc[snps, :, :]
        # Each metadata array is restricted from itself (h_snp was previously taken from g_snp)
        g_snp, h_snp = g_snp[snps], h_snp[snps]

        # Restrict frames to snps, convert to new SNP indices.
        # new_snp_index maps an original SNP index to its position within snps.
        new_snp_index = dict((v, k) for k, v in enumerate(snps))
        restricted_frames = util.mdict()
        # Iterate over the ORIGINAL frames; the restricted ones go into a separate container.
        # NOTE(review): assignment form kept from the original code - presumably util.mdict
        # accumulates the values assigned to a key; confirm against util.mdict.
        for k, v in frames.iteritems():
            for frame in v:
                # Materialized as a list so that a frame can be traversed more than once
                restricted_frames[k] = [new_snp_index[x] for x in frame if x in new_snp_index]
        frames = restricted_frames

    # Restrict to the genotyped samples of the sub-pedigree and deep-copy
    genotyped = p.sample_index[0:num_genotyped]
    g, h = g[:, genotyped, :].copy(), h[:, genotyped, :].copy()
    if qc.size:
        qc = qc[:, genotyped].copy()
    g_snp, h_snp = g_snp.copy(), h_snp.copy()

    # Build sub-problem object graph
    g = im.factory.GenotypeFactory.new_instance('genotype', g, g_snp)
    h = im.factory.GenotypeFactory.new_instance('haplotype', h, h_snp)
    h.qc = qc

    # Build restricted info object
    error = self.error[:, genotyped].copy() if self.error.size else self.error
    if snps is not None and error.size:
        error = error[snps, :]
    sample_set = set(samples)
    # Original sample index -> index within the sub-pedigree
    sample_index_map = util.dict_invert(dict(enumerate(p.sample_index)))
    # Keep only the IBD segments all of whose samples belong to the subset
    ibd = im.segment.SegmentSet(
        im.segment.Segment(x.snp,
                           [(sample_index_map[y[0]], y[1]) for y in x.samples],
                           x.bp, error_snps=x.error_snps)
        for x in self.info.ibd
        if sample_set >= set([y[0] for y in x.samples]))
    info = ProblemInfo(p, g, snp=(self.info.snp[snps] if snps is not None else self.info.snp), ibd=ibd)
    return Problem(p, g, haplotype=h, info=info, error=error, frames=frames, lam=self.lam)
discordant = len(np.where(ra != rb)[0]) return (1.0 * concordant) / (concordant + discordant), concordant, discordant def concordance_genotypes(a, b): '''Return the concordance between the nx2 genotype arrays a and b. Only takes into account entries where both genotypes are fully called.''' return concordance_recoded(recode_single_genotype(a), recode_single_genotype(b)) #--------------------------------------------- # PLINK- and CGI-related recoding #--------------------------------------------- # CGI genotype coding CGI_MISSING_LETTER = 'N' CGI_GENOTYPES = [x[0] + x[1] for x in list(it.product(CGI_MISSING_LETTER + '01', CGI_MISSING_LETTER + '01'))] # CGI letter to PLINK recode12 - conversion table CGI_LETTER_TO_ALLELE = {'N': 0, '0': 1, '1': 2} ALLELE_TO_CGI_LETTER = util.dict_invert(CGI_LETTER_TO_ALLELE) '''Convert 0,1,2 allele values in a genotype data array to CGI letters N,0,1.''' recode_cgi = lambda g: np.vectorize(lambda x: ALLELE_TO_CGI_LETTER[x])(g) CGI_LETTER_TO_ALLELE_FLIPPED = {'N': 0, '0': 2, '1': 1} ALLELE_TO_CGI_LETTER_FLIPPED = util.dict_invert(CGI_LETTER_TO_ALLELE_FLIPPED) '''Same as recode_cgi, only that alleles are also flipped (1->2 and 2->1).''' recode_cgi_flipped = lambda g: np.vectorize(lambda x: ALLELE_TO_CGI_LETTER_FLIPPED[x])(g) '''Convert CGI letter genotype to PLINK 1-2 allele coding.''' recode12 = lambda g: str(CGI_LETTER_TO_ALLELE[g[0]]) + ' ' + str(CGI_LETTER_TO_ALLELE[g[1]])
from utils.pointer import Pointer, OverlapError
from utils.pointer import Visit

# **** BEGIN CLASS

# Lookup table built by dict_from() from CC and the VOLUME/PITCH/PAN keyword values.
# Per the type comment it maps CC members to integer codes.
CC2BMS = dict_from(
    CC,
    VOLUME=0,
    PITCH=1,
    PAN=3
)  # type: Dict[CC, int]
# Reverse lookup of CC2BMS (integer code -> CC member, per the type comment).
BMS2CC = dict_invert(CC2BMS)  # type: Dict[int, CC]

# BmsTrack = None     # fixme


# Exception type declared here; where it is raised and caught is not visible in this section.
# NOTE(review): presumably signals the end of a track while parsing - confirm against callers.
class EndException(Exception):
    pass


# Event class passed to register() with the code 0xC1.
@register(0xC1)
class Child(Event):
    # Field descriptors for this event: 'tracknum' via u8() and 'addr' via u24().
    # NOTE(review): presumably Event turns these into the self.tracknum / self.addr
    # attributes used by after() - confirm against Event.
    keys = [u8('tracknum'), u24('addr')]

    def after(self, track: BmsTrack):
        # Build a BmsTrack over the same file as 'track', at this event's address and
        # track number, and parse it immediately.
        # NOTE(review): the annotation refers to BmsTrack when this 'def' is evaluated, so
        # BmsTrack must already be bound at that point (cf. the 'fixme' above) - confirm.
        BmsTrack(track._file, self.addr, self.tracknum, None).parse()
def __read_pedigree(file_name, genotyped_ids=None):
    '''Load a pedigree from the PLINK TFAM file file_name.

    If a list of genotyped IDs genotyped_ids is specified, the pedigree node IDs are recoded
    to consecutive integers starting at Pedigree.START_ID: the genotyped IDs come first (in
    the given order), followed by the remaining IDs in sorted order, and the node type is set
    to genotyped/not genotyped accordingly. Otherwise the IDs read from the file are used as
    they are.

    Returns a Pedigree object. Raises ValueError if a child node has more than two parents.'''
    # Load columns 1..5 of the text file (column 0 is not read)
    data = np.genfromtxt(file_name, dtype=np.dtype(int), usecols=range(1, 6))
    nodes, missing = data[:, 0], MISSING
    num_samples, num_columns = data.shape
    if genotyped_ids is None:
        sample_id = nodes
        order = np.arange(0, num_samples)
        num_genotyped = len(sample_id)
        # NOTE(review): only five columns are requested through usecols above, so the
        # num_columns >= 6 branch looks unreachable - confirm the intended column layout.
        node_type = data[:, 5] if num_columns >= 6 else np.tile(constants.INDETERMINATE, (num_samples, 1))
        node_to_sample_id = None
    else:
        # Recode nodes if a genotyped ID list was specified

        # Dummy node ID for missing data in the pedigree adjacency list. Must not be a
        # possible node number (typically, a negative number should work).
        missing = -1
        if isinstance(genotyped_ids, np.ndarray):
            genotyped_ids = genotyped_ids.tolist()
        rest = sorted(set(nodes) - set(genotyped_ids))
        # Keep track of original IDs
        sample_id = np.array([missing] + genotyped_ids + list(rest))
        recoding = dict(zip(sample_id,
                            [missing] + range(Pedigree.START_ID, Pedigree.START_ID + data.shape[0])))

        # Recode ID columns (first three); since array is thin and long, recode columns and
        # then transpose. IDs without a recoding entry become the dummy 'missing' node.
        num_id_columns = 3
        recoded_ids = np.array([[recoding.get(x, missing) for x in data[:, j]]
                                for j in xrange(0, num_id_columns)]).transpose()
        data = np.concatenate((recoded_ids, data[:, num_id_columns:]), axis=1)
        nodes = data[:, 0]
        # The sample_id entry of the missing value is deliberately kept (not sliced off)
        node_to_sample_id = util.dict_invert(recoding)
        num_genotyped = len(genotyped_ids)
        node_type = np.array([(Person.TYPE.GENOTYPED if x < num_genotyped else Person.TYPE.NOT_GENOTYPED)
                              for x in xrange(0, num_samples)])

    # Construct graph
    graph = nx.DiGraph()
    # Add all nodes
    graph.add_nodes_from(nodes)
    # Add father->child edges, mother->child edges
    for (parent_type, column) in {PATERNAL: 1, MATERNAL: 2}.iteritems():
        graph.add_edges_from(data[:, (column, 0)], type=parent_type)
    # Remove missing data = edges from nodes to the dummy node 'missing'
    graph.remove_node(missing)

    if genotyped_ids is not None:
        # After graph might have reordered nodes, reorder sample_ids accordingly
        node_to_line = dict(zip(nodes, graph.nodes()))
        sample_id = np.array([node_to_sample_id[x] for x in graph.nodes()])
        order = np.array([node_to_line[x] for x in graph.nodes()])

    # Check that in-degree is at most 2. np.any() is used because the truth value of the
    # index array returned by np.nonzero() is wrong for a single offender at position 0 and
    # raises an unrelated error when there are several offenders.
    if np.any(np.array(graph.in_degree().values()) > 2):
        raise ValueError('Input data contains a child node with more than two parents')

    return Pedigree(graph, sample_id=sample_id, num_genotyped=num_genotyped,
                    sex=data[order, 3], phenotype=data[order, 4], node_type=node_type)
def __read_pedigree(file_name, genotyped_ids=None):
    '''Load a pedigree from the PLINK TFAM file file_name.

    If a list of genotyped IDs genotyped_ids is specified, the pedigree node IDs are recoded
    to consecutive integers starting at Pedigree.START_ID: the genotyped IDs come first (in
    the given order), followed by the remaining IDs in sorted order, and the node type is set
    to genotyped/not genotyped accordingly. Otherwise the IDs read from the file are used as
    they are.

    Returns a Pedigree object. Raises ValueError if a child node has more than two parents.'''
    # Load columns 1..5 of the text file (column 0 is not read)
    data = np.genfromtxt(file_name, dtype=np.dtype(int), usecols=range(1, 6))
    nodes, missing = data[:, 0], MISSING
    num_samples, num_columns = data.shape
    if genotyped_ids is None:
        sample_id = nodes
        order = np.arange(0, num_samples)
        num_genotyped = len(sample_id)
        # NOTE(review): only five columns are requested through usecols above, so the
        # num_columns >= 6 branch looks unreachable - confirm the intended column layout.
        node_type = data[:, 5] if num_columns >= 6 else np.tile(constants.INDETERMINATE, (num_samples, 1))
        node_to_sample_id = None
    else:
        # Recode nodes if a genotyped ID list was specified

        # Dummy node ID for missing data in the pedigree adjacency list. Must not be a
        # possible node number (typically, a negative number should work).
        missing = -1
        if isinstance(genotyped_ids, np.ndarray):
            genotyped_ids = genotyped_ids.tolist()
        rest = sorted(set(nodes) - set(genotyped_ids))
        # Keep track of original IDs
        sample_id = np.array([missing] + genotyped_ids + list(rest))
        recoding = dict(zip(sample_id,
                            [missing] + range(Pedigree.START_ID, Pedigree.START_ID + data.shape[0])))

        # Recode ID columns (first three); since array is thin and long, recode columns and
        # then transpose. IDs without a recoding entry become the dummy 'missing' node.
        num_id_columns = 3
        recoded_ids = np.array([[recoding.get(x, missing) for x in data[:, j]]
                                for j in xrange(0, num_id_columns)]).transpose()
        data = np.concatenate((recoded_ids, data[:, num_id_columns:]), axis=1)
        nodes = data[:, 0]
        # The sample_id entry of the missing value is deliberately kept (not sliced off)
        node_to_sample_id = util.dict_invert(recoding)
        num_genotyped = len(genotyped_ids)
        node_type = np.array([(Person.TYPE.GENOTYPED if x < num_genotyped else Person.TYPE.NOT_GENOTYPED)
                              for x in xrange(0, num_samples)])

    # Construct graph
    graph = nx.DiGraph()
    # Add all nodes
    graph.add_nodes_from(nodes)
    # Add father->child edges, mother->child edges
    for (parent_type, column) in {PATERNAL: 1, MATERNAL: 2}.iteritems():
        graph.add_edges_from(data[:, (column, 0)], type=parent_type)
    # Remove missing data = edges from nodes to the dummy node 'missing'
    graph.remove_node(missing)

    if genotyped_ids is not None:
        # After graph might have reordered nodes, reorder sample_ids accordingly
        node_to_line = dict(zip(nodes, graph.nodes()))
        sample_id = np.array([node_to_sample_id[x] for x in graph.nodes()])
        order = np.array([node_to_line[x] for x in graph.nodes()])

    # Check that in-degree is at most 2. np.any() is used because the truth value of the
    # index array returned by np.nonzero() is wrong for a single offender at position 0 and
    # raises an unrelated error when there are several offenders.
    if np.any(np.array(graph.in_degree().values()) > 2):
        raise ValueError('Input data contains a child node with more than two parents')

    return Pedigree(graph, sample_id=sample_id, num_genotyped=num_genotyped,
                    sex=data[order, 3], phenotype=data[order, 4], node_type=node_type)
def test_dict_invert(self):
    '''Check that inverting a one-to-one dictionary swaps its keys and values.'''
    keys, values = range(0, 4), range(4, 8)
    forward = dict(zip(keys, values))
    expected = dict(zip(values, keys))
    actual = dict_invert(forward)
    assert_equal(actual, expected, 'Wrong dictionary inversion')