Beispiel #1
0
 def __init__(self, data, snp, sample_id):
     '''
     Build a genotype set from pre-loaded arrays:
     - data: genotype data array. The SNP count is read from its first axis and the sample
       count from its second axis (a third, allele axis is presumably present - confirm
       against callers).
     - snp: SNP metadata record array; it must provide a 'base_pair' field.
     - sample_id: IDs of the genotyped individuals.
     '''
     # Raw inputs: people's IDs and the genotype array
     self.sample_id = sample_id
     self.data = data
     dimensions = self.data.shape
     self._num_snps = dimensions[0]
     self._num_samples = dimensions[1]
     self._snp_range = None

     # SNP metadata record array
     self.snp = snp
     # Base-pair locations and the location -> SNP index map (both built here, at construction)
     locations = self.snp['base_pair']
     self._base_pair = locations
     self._bp_to_snp = dict_invert(dict((index, bp) for index, bp in enumerate(locations)))
     # Search structures over the base-pair locations
     insertion_order = optimal_insertion_order(self._num_snps)
     self._snp_tree = BinarySearchTree(values=locations[insertion_order])
     self._snp_index_tree = util.list_index_tree(locations)
     # Genetic map (allele letters per SNP) and general metadata; both start out empty
     self.map = []
     self.metadata = []

     # Per-sample parent-of-origin phase flags, initialised to zero
     self.poo_phase = np.zeros(self._num_samples, dtype=np.byte)
Beispiel #2
0
 def sub_pedigree(self, samples):
     '''Return the sub-pedigree restricted to the node array (or list) samples.

     Nodes whose index is below self.num_genotyped are placed first, followed by the
     remaining nodes; the sub-pedigree nodes are renumbered 0..len(samples)-1 in that order.'''
     if isinstance(samples, list):
         samples = np.array(samples)
     total = len(samples)
     genotyped_positions = np.where(samples < self.num_genotyped)[0]
     other_positions = np.where(samples >= self.num_genotyped)[0]
     num_genotyped = len(genotyped_positions)
     samples = samples[np.concatenate((genotyped_positions, other_positions))]

     # Rebuild the family graph on the new node numbers, one parent type at a time
     new_ids = range(0, total)
     old_to_new = dict(zip(samples, new_ids))
     induced = self.graph.subgraph(samples)
     sub_graph = nx.DiGraph()
     sub_graph.add_nodes_from(new_ids)
     for parent_type in im.constants.ALLELES:
         typed_edges = ((old_to_new[u], old_to_new[v]) for u, v in induced.edges_iter()
                        if induced.edge[u][v]['type'] == parent_type)
         sub_graph.add_edges_from(typed_edges, type=parent_type)

     # Carry the per-node attributes over, indexed by the original node number
     new_to_old = util.dict_invert(old_to_new)
     sample_index = [new_to_old[j] for j in new_ids]
     sample_id = np.array([self.sample_id[i] for i in sample_index])
     sex = np.array([self.sex[i] for i in sample_index])
     phenotype = np.array([self.phenotype[i] for i in sample_index])
     node_type = np.array([self.node_type[i] for i in sample_index])

     return Pedigree(sub_graph, sample_id=sample_id, sex=sex, phenotype=phenotype,
                     node_type=node_type, sample_index=sample_index,
                     num_genotyped=num_genotyped)
Beispiel #3
0
    def sub_problem(self, samples, snps=None):
        '''Return a sub-problem restricted to the nodes in 'samples' and, optionally, to 'snps'.

        - samples: list or numpy array of node indices of this problem's pedigree.
        - snps: optional SNP index collection; when None, all SNPs are kept.

        The genotype, haplotype, QC and error arrays are copied for the genotyped part of
        'samples'. Only IBD segments whose samples all belong to 'samples' are retained, with
        their sample indices translated to the sub-pedigree numbering.'''
        if isinstance(samples, list): samples = np.array(samples)
        # Create pedigree object
        p = self.sub_pedigree(samples)

        # Re-order samples so that all genotyped appear before all non-genotyped
        genotyped = np.where(samples < self.pedigree.num_genotyped)[0]
        num_genotyped = len(genotyped)
        samples = samples[np.concatenate((genotyped, np.where(samples >= self.pedigree.num_genotyped)[0]))]

        # Restrict the data arrays and SNP metadata to the requested SNPs
        g, h = self.data
        g_snp, h_snp, qc = self.genotype.snp, self.haplotype.snp, self.haplotype.qc
        frames = self.frames
        if snps is not None:
            g, h = g[snps, :, :], h[snps, :, :]
            # An empty qc array is kept as-is so that the qc.size test below stays valid
            if qc.size: qc = qc[snps, :, :]
            # Each metadata array is restricted from its own source
            g_snp, h_snp = g_snp[snps], h_snp[snps]
            # Restrict frames to snps, convert to new SNP indices
            new_index = dict((v, k) for k, v in enumerate(snps))
            frames = util.mdict()
            if self.frames:
                # Read from the ORIGINAL frames; the new container starts out empty.
                # Presumably mdict item assignment accumulates values per key - confirm.
                for k, v in self.frames.iteritems():
                    for frame in v:
                        # Materialized so that a restricted frame can be traversed more than once
                        frames[k] = [new_index[x] for x in frame if x in new_index]

        # Restrict to the genotyped samples, in sub-pedigree order; copy so that the
        # sub-problem does not share array memory with this problem
        genotyped = p.sample_index[0:num_genotyped]
        g, h = g[:, genotyped, :].copy(), h[:, genotyped, :].copy()
        if qc.size: qc = qc[:, genotyped].copy()
        g_snp, h_snp = g_snp.copy(), h_snp.copy()

        # Build sub-problem object graph
        g = im.factory.GenotypeFactory.new_instance('genotype', g, g_snp)
        h = im.factory.GenotypeFactory.new_instance('haplotype', h, h_snp)
        h.qc = qc
        # Build restricted info object
        error = self.error[:, genotyped].copy() if self.error.size else self.error
        if snps is not None and error.size: error = error[snps, :]
        sample_set = set(samples)
        sample_index_map = util.dict_invert(dict(enumerate(p.sample_index)))
        # Keep only segments fully contained in the sample set; a list (not a lazy map object)
        # is passed as the segment's sample collection
        ibd = im.segment.SegmentSet(
            im.segment.Segment(x.snp, [(sample_index_map[y[0]], y[1]) for y in x.samples],
                               x.bp, error_snps=x.error_snps)
            for x in self.info.ibd if sample_set.issuperset(y[0] for y in x.samples))
        info = ProblemInfo(p, g, snp=(self.info.snp[snps] if snps is not None else self.info.snp), ibd=ibd)
        return Problem(p, g, haplotype=h, info=info, error=error, frames=frames, lam=self.lam)
Beispiel #4
0
    discordant = len(np.where(ra != rb)[0])
    return (1.0 * concordant) / (concordant + discordant), concordant, discordant

def concordance_genotypes(a, b):
    '''Return the concordance between the nx2 genotype arrays a and b.

    Both arrays are recoded with recode_single_genotype and the recoded arrays are compared
    with concordance_recoded, whose result is returned unchanged. Only entries where both
    genotypes are fully called are taken into account.'''
    recoded_a = recode_single_genotype(a)
    recoded_b = recode_single_genotype(b)
    return concordance_recoded(recoded_a, recoded_b)

#---------------------------------------------
# PLINK- and CGI-related recoding
#---------------------------------------------
# CGI genotype coding
CGI_MISSING_LETTER = 'N'
# All ordered two-letter genotypes over the letters N, 0, 1
CGI_GENOTYPES = [first + second for first, second in it.product(CGI_MISSING_LETTER + '01', repeat=2)]

# CGI letter to PLINK recode12 - conversion table, and its inverse
CGI_LETTER_TO_ALLELE = {'N': 0, '0': 1, '1': 2}
ALLELE_TO_CGI_LETTER = util.dict_invert(CGI_LETTER_TO_ALLELE)

# Same tables with the two called alleles swapped (1 <-> 2)
CGI_LETTER_TO_ALLELE_FLIPPED = {'N': 0, '0': 2, '1': 1}
ALLELE_TO_CGI_LETTER_FLIPPED = util.dict_invert(CGI_LETTER_TO_ALLELE_FLIPPED)

def recode_cgi(g):
    '''Convert 0,1,2 allele values in a genotype data array to CGI letters N,0,1.

    The conversion is applied element-wise; a value that is not in ALLELE_TO_CGI_LETTER
    raises KeyError.'''
    return np.vectorize(lambda x: ALLELE_TO_CGI_LETTER[x])(g)

def recode_cgi_flipped(g):
    '''Same as recode_cgi, only that alleles are also flipped (1->2 and 2->1).'''
    return np.vectorize(lambda x: ALLELE_TO_CGI_LETTER_FLIPPED[x])(g)

def recode12(g):
    '''Convert CGI letter genotype to PLINK 1-2 allele coding.

    g is indexed as g[0], g[1] (e.g. a two-letter string); the result is the two allele codes
    separated by a single space, e.g. '01' -> '1 2'.'''
    return str(CGI_LETTER_TO_ALLELE[g[0]]) + ' ' + str(CGI_LETTER_TO_ALLELE[g[1]])
Beispiel #5
0
from utils.pointer import Pointer, OverlapError
from utils.pointer import Visit



# **** BEGIN CLASS

# Lookup table built by dict_from from CC and the name/number pairs below
# (presumably keyed by the CC members VOLUME, PITCH and PAN - confirm against dict_from).
CC2BMS = dict_from(CC, VOLUME=0, PITCH=1, PAN=3)  # type: Dict[CC, int]

# Reverse lookup of CC2BMS: number -> CC.
BMS2CC = dict_invert(CC2BMS)  # type: Dict[int, CC]

# BmsTrack = None     # fixme


class EndException(Exception):
    """Exception type with no extra state or behaviour.

    NOTE(review): presumably raised to signal that the end of the data being
    parsed has been reached - confirm against the raise sites.
    """


@register(0xC1)
class Child(Event):
    """Event registered under 0xC1 that carries a track number and an address.

    NOTE(review): presumably 0xC1 is the command byte that opens a child
    track - confirm against `register`.
    """

    # Fields of the event: an 8-bit 'tracknum' followed by a 24-bit 'addr'.
    keys = [u8('tracknum'), u24('addr')]

    def after(self, track: 'BmsTrack'):
        """Build a BmsTrack for this event's address and track number, and parse it.

        The new track is created on the same file as ``track``.  The
        annotation is a quoted forward reference so that defining this class
        does not require ``BmsTrack`` to exist yet; the name is only looked
        up when this method runs.
        """
        BmsTrack(track._file, self.addr, self.tracknum, None).parse()
Beispiel #6
0
def __read_pedigree(file_name, genotyped_ids=None):
    '''Load a pedigree from the PLINK TFAM file file_name.

    Columns 1-5 of the file are read as integers and used as: node ID, father ID, mother ID,
    sex, phenotype.

    If genotyped_ids is None, node IDs are used as they appear in the file, every node is
    counted as genotyped, and node types are set to constants.INDETERMINATE.

    If a list (or numpy array) genotyped_ids is specified, the IDs are recoded to consecutive
    integers starting at Pedigree.START_ID: genotyped IDs first, then the remaining file IDs in
    sorted order. Parent IDs that are not recoded are treated as missing. The first
    len(genotyped_ids) node types are Person.TYPE.GENOTYPED, the rest
    Person.TYPE.NOT_GENOTYPED.

    Raises ValueError if a node has more than two parents.'''

    # Load data from text file_name. atleast_2d keeps a single-line file two-dimensional.
    data = np.atleast_2d(
        np.genfromtxt(file_name, dtype=np.dtype(int), usecols=range(1, 6)))
    nodes, missing = data[:, 0], MISSING
    num_samples, num_columns = data.shape
    if genotyped_ids is None:
        sample_id = nodes
        order = np.arange(0, num_samples)
        num_genotyped = len(sample_id)
        # NOTE(review): only 5 columns are loaded above, so the num_columns >= 6 branch is
        # not reached and node_type is always the INDETERMINATE array.
        node_type = data[:, 5] if num_columns >= 6 else np.tile(
            constants.INDETERMINATE, (num_samples, 1))
        node_to_sample_id = None
    else:
        # Recode nodes if a genotyped ID list was specified

        # Dummy node ID for missing data in the pedigree adjacency list. Must not be a
        # possible node number (typically, a negative number should work).
        missing = -1
        if isinstance(genotyped_ids, np.ndarray):
            genotyped_ids = genotyped_ids.tolist()
        rest = sorted(set(nodes) - set(genotyped_ids))
        # Keep track of original IDs
        sample_id = np.array([missing] + genotyped_ids + list(rest))
        # list(...) so that the concatenation also works where range is not a list
        new_ids = [missing] + list(
            range(Pedigree.START_ID, Pedigree.START_ID + data.shape[0]))
        recoding = dict(zip(sample_id, new_ids))
        # Recode ID columns (first three); since array is thin and long, recode columns and
        # then transpose. IDs without a recoding are mapped to the dummy node.
        num_id_columns = 3
        recoded_ids = np.array([[recoding.get(x, missing) for x in data[:, j]]
                                for j in range(num_id_columns)]).transpose()
        data = np.concatenate((recoded_ids, data[:, num_id_columns:]), axis=1)
        nodes = data[:, 0]
        node_to_sample_id = util.dict_invert(recoding)

        num_genotyped = len(genotyped_ids)
        node_type = np.array([(Person.TYPE.GENOTYPED if x < num_genotyped else
                               Person.TYPE.NOT_GENOTYPED)
                              for x in range(num_samples)])

    # Construct _graph
    graph = nx.DiGraph()
    # Add all nodes
    graph.add_nodes_from(nodes)
    # Add father->child edges, mother->child edges
    for parent_type, column in ((PATERNAL, 1), (MATERNAL, 2)):
        graph.add_edges_from(data[:, (column, 0)], type=parent_type)
    # Remove missing data = edges from nodes to the dummy node 'missing'. The dummy node only
    # exists if some parent entry was missing.
    if missing in graph:
        graph.remove_node(missing)

    if genotyped_ids is not None:
        # After graph might have reordered nodes, reorder sample_ids accordingly
        # NOTE(review): node_to_line pairs the file-order node IDs with graph.nodes() rather
        # than with line numbers - confirm that 'order' is the intended row index into data.
        node_to_line = dict(zip(nodes, graph.nodes()))
        sample_id = np.array([node_to_sample_id[x] for x in graph.nodes()])
        order = np.array([node_to_line[x] for x in graph.nodes()])

    # Check that in-degree is at most 2. The degrees are tested directly: the truth value of
    # an index array would miss an offending node at index 0 and fail on several offenders.
    if any(degree > 2 for degree in dict(graph.in_degree()).values()):
        raise ValueError(
            'Input data contains a child node with more than two parents')

    return Pedigree(graph,
                    sample_id=sample_id,
                    num_genotyped=num_genotyped,
                    sex=data[order, 3],
                    phenotype=data[order, 4],
                    node_type=node_type)
Beispiel #7
0
def __read_pedigree(file_name, genotyped_ids=None):
    '''Load a pedigree from the PLINK TFAM file file_name.

    Columns 1-5 of the file are read as integers and used as: node ID, father ID, mother ID,
    sex, phenotype.

    If genotyped_ids is None, node IDs are used as they appear in the file, every node is
    counted as genotyped, and node types are set to constants.INDETERMINATE.

    If a list (or numpy array) genotyped_ids is specified, the IDs are recoded to consecutive
    integers starting at Pedigree.START_ID: genotyped IDs first, then the remaining file IDs in
    sorted order. Parent IDs that are not recoded are treated as missing. The first
    len(genotyped_ids) node types are Person.TYPE.GENOTYPED, the rest
    Person.TYPE.NOT_GENOTYPED.

    Raises ValueError if a node has more than two parents.'''

    # Load data from text file_name. atleast_2d keeps a single-line file two-dimensional.
    data = np.atleast_2d(np.genfromtxt(file_name, dtype=np.dtype(int), usecols=range(1, 6)))
    nodes, missing = data[:, 0], MISSING
    num_samples, num_columns = data.shape
    if genotyped_ids is None:
        sample_id = nodes
        order = np.arange(0, num_samples)
        num_genotyped = len(sample_id)
        # NOTE(review): only 5 columns are loaded above, so the num_columns >= 6 branch is
        # not reached and node_type is always the INDETERMINATE array.
        node_type = data[:, 5] if num_columns >= 6 else np.tile(constants.INDETERMINATE, (num_samples, 1))
        node_to_sample_id = None
    else:
        # Recode nodes if a genotyped ID list was specified

        # Dummy node ID for missing data in the pedigree adjacency list. Must not be a
        # possible node number (typically, a negative number should work).
        missing = -1
        if isinstance(genotyped_ids, np.ndarray):
            genotyped_ids = genotyped_ids.tolist()
        rest = sorted(set(nodes) - set(genotyped_ids))
        # Keep track of original IDs
        sample_id = np.array([missing] + genotyped_ids + list(rest))
        # list(...) so that the concatenation also works where range is not a list
        new_ids = [missing] + list(range(Pedigree.START_ID, Pedigree.START_ID + data.shape[0]))
        recoding = dict(zip(sample_id, new_ids))
        # Recode ID columns (first three); since array is thin and long, recode columns and
        # then transpose. IDs without a recoding are mapped to the dummy node.
        num_id_columns = 3
        recoded_ids = np.array([[recoding.get(x, missing) for x in data[:, j]]
                                for j in range(num_id_columns)]).transpose()
        data = np.concatenate((recoded_ids, data[:, num_id_columns:]), axis=1)
        nodes = data[:, 0]
        node_to_sample_id = util.dict_invert(recoding)

        num_genotyped = len(genotyped_ids)
        node_type = np.array([(Person.TYPE.GENOTYPED if x < num_genotyped else Person.TYPE.NOT_GENOTYPED)
                              for x in range(num_samples)])

    # Construct _graph
    graph = nx.DiGraph()
    # Add all nodes
    graph.add_nodes_from(nodes)
    # Add father->child edges, mother->child edges
    for parent_type, column in ((PATERNAL, 1), (MATERNAL, 2)):
        graph.add_edges_from(data[:, (column, 0)], type=parent_type)
    # Remove missing data = edges from nodes to the dummy node 'missing'. The dummy node only
    # exists if some parent entry was missing.
    if missing in graph:
        graph.remove_node(missing)

    if genotyped_ids is not None:
        # After graph might have reordered nodes, reorder sample_ids accordingly
        # NOTE(review): node_to_line pairs the file-order node IDs with graph.nodes() rather
        # than with line numbers - confirm that 'order' is the intended row index into data.
        node_to_line = dict(zip(nodes, graph.nodes()))
        sample_id = np.array([node_to_sample_id[x] for x in graph.nodes()])
        order = np.array([node_to_line[x] for x in graph.nodes()])

    # Check that in-degree is at most 2. The degrees are tested directly: the truth value of
    # an index array would miss an offending node at index 0 and fail on several offenders.
    if any(degree > 2 for degree in dict(graph.in_degree()).values()):
        raise ValueError('Input data contains a child node with more than two parents')

    return Pedigree(graph, sample_id=sample_id, num_genotyped=num_genotyped,
                    sex=data[order, 3], phenotype=data[order, 4], node_type=node_type)
Beispiel #8
0
 def test_dict_invert(self):
     '''Check that dict_invert swaps the keys and values of a one-to-one dictionary.'''
     forward = dict((key, key + 4) for key in range(4))
     expected = dict((key + 4, key) for key in range(4))
     assert_equal(dict_invert(forward), expected, 'Wrong dictionary inversion')
Beispiel #9
0
 def test_dict_invert(self):
     '''Check that dict_invert swaps the keys and values of a one-to-one dictionary.'''
     forward = dict((key, key + 4) for key in range(4))
     expected = dict((key + 4, key) for key in range(4))
     assert_equal(dict_invert(forward), expected, 'Wrong dictionary inversion')