Example #1
0
 def _test_complete_genotype_partial(self, h, h_expected):
     """Check completion of one haplotype pair h against every possible genotype.

     h is replicated once per genotype in {0,1,2} x {0,1,2} (nine rows), passed together with
     those genotypes to gt.complete_haplotype_partial, and the replicated array is then compared
     with h_expected."""
     # All 9 ordered genotype pairs, in the same row order as a nested loop over (0,1,2)
     all_genotypes = np.array(list(it.product(range(3), repeat=2)))
     completed = np.tile(h, (len(all_genotypes), 1)).copy()
     gt.complete_haplotype_partial(completed, all_genotypes)
     assert_equal(completed, h_expected, "Unexpected haplotype completion")
Example #2
0
 def _test_complete_genotype_partial(self, h, h_expected):
     '''Check completion of one haplotype pair h against every possible genotype.

     h is replicated once per genotype in {0,1,2} x {0,1,2} (nine rows), passed together with
     those genotypes to gt.complete_haplotype_partial, and the replicated array is then compared
     with h_expected.'''
     # All 9 ordered genotype pairs, in the same row order as a nested loop over (0,1,2)
     all_genotypes = np.array(list(it.product(range(3), repeat=2)))
     completed = np.tile(h, (len(all_genotypes), 1)).copy()
     gt.complete_haplotype_partial(completed, all_genotypes)
     assert_equal(completed, h_expected, 'Unexpected haplotype completion')
Example #3
0
    def impute(self, samples=None):
        '''Infer imputed genotypes at all samples of h from the samples of g.'''
        # Aliases
        r = self.ratio_threshold
        if self.debug_sample >= 0:
            j = self.debug_sample
            if j in self.training_index:
                print 'Initial g[%d] = %s, h[%d] = %s' % (
                    self.training_index[j], repr(
                        self.g[self.training_index[j]]), j, repr(self.h[j]))
            else:
                print 'Initial h[%d] = %s' % (j, repr(self.h[j]))

        # Create a queue of haplotypes that can be used to impute others. Initially, it
        # is the training set homozygotes; every time we phase a training set het, its other
        # allele is appended to the queue.
        q = Queue()

        # Initial condition: phase all hom training samples
        hom = self.__phase_hom()
        for hap in itertools.product(hom, ALLELES):
            if self.debug:
                print 'Adding hom haplotype to queue', hap
            q.put(hap)
        num_hom_haps = q.qsize()
        if self.debug:
            print 'Items on queue : %d' % (q.qsize(), )
            print 'filled haps    : %.2f%%' % (
                (100. * len(self.h.nonzero()[0])) / self.h.size, )
            HH = recode.recode_single_genotype(self.h)
            print 'filled samples : %.2f%%' % (
                (100. * len(np.where(HH > 0)[0])) / HH.size, )
            print 'phased training: %.2f%%' % (
                (100. * len(self.h[self.training_sample_index].nonzero()[0])) /
                self.h[self.training_sample_index].size, )
        count = 0
        while not q.empty():
            # Get the IBD group of an imputed haplotype (all haps that are IBD with it at self.snp)
            hap = q.get()
            count += 1
            if count > self.max_iter:
                raise ValueError(
                    'Possible imputation bug - exceeded max number of iterations'
                )
            if self.debug:
                print '*' * 55
                print 'Iteration %d, imputing from hap %s' % (count, hap)
                print '*' * 55
                if self.debug_sample >= 0:
                    j = self.debug_sample
                    print 'h[%d] = %s' % (j, repr(self.h[j]))
                    if self.h[j, 0] == 1:
                        pass
            group = self.ibd.find(self.chrom, self.snp, hap[SAMPLE],
                                  hap[ALLELE])
            if group.size:
                # if self.debug: print 'group', group
                s, a = group[:, SAMPLE], group[:, ALLELE]
                H = self.h[s, a]
                #                    print 'H', H
                # Find haplotypes that have been imputed with each of the alleles
                R1, R2 = group[np.where(H == 1)], group[np.where(H == 2)]
                if self.debug:
                    print 'IBD group %d (%d haps):\n%s' % (
                        self.ibd.group_index(self.chrom, self.snp, hap[SAMPLE],
                                             hap[ALLELE]), len(group),
                        repr(
                            np.concatenate(
                                (group, H[np.newaxis].transpose()), axis=1)))
                    print 'R1 = %s' % (repr(list(map(tuple, R1))), )
                    print 'R2 = %s' % (repr(list(map(tuple, R2))), )
#                    print 'R1:\n%s' % (repr(np.concatenate((R1, self.h[R1[:, 0], R1[:, 1]][np.newaxis].transpose()), axis=1)))
#                    print 'R2:\n%s' % (repr(np.concatenate((R2, self.h[R2[:, 0], R2[:, 1]][np.newaxis].transpose()), axis=1)))
# Majority vote: if there are enough haps with one allele, override the rest.
# Otherwise, an unresolved conflict ==> zero everyone out.
                l1, l2 = len(R1), len(R2)
                consensus = 1 if l1 >= r * l2 else (
                    2 if l2 >= r * l1 else MISSING)
                self.h[s, a] = consensus
                if consensus == 0:
                    # If no consensus is reached, keep the already-imputed values in place, otherwise
                    # we can run into an infinite loop by imputing and erasing h-entries.
                    self.h[R1[:, 0], R1[:, 1]] = 1
                    self.h[R2[:, 0], R2[:, 1]] = 2
                H = self.h[s]
                if self.debug:
                    print 'l1 %d l2 %d consensus %d' % (l1, l2, consensus)
                    print 'Items on queue : %d' % (q.qsize(), )
                    print 'filled haps    : %.2f%%' % (
                        (100. * len(self.h.nonzero()[0])) / self.h.size, )
                    HH = recode.recode_single_genotype(self.h)
                    print 'filled samples : %.2f%%' % (
                        (100. * len(np.where(HH > 0)[0])) / HH.size, )
                    print 'phased training: %.2f%%' % (
                        (100. * len(
                            self.h[self.training_sample_index].nonzero()[0])) /
                        self.h[self.training_sample_index].size, )

                # Phase training hets (this includes BOTH partially-called = potential hets an
                # fully-called hets) with one imputed allele
                i = np.array([self.training_index.has_key(x) for x in s])
                si = s[i]
                G = self.g[map(self.training_index.get, si), :]
                unphased_hets = np.where(
                    ((H[i, PATERNAL] != MISSING) ^ (H[i, MATERNAL] != MISSING))
                    & ((G[:, PATERNAL] != MISSING)
                       | (G[:, MATERNAL] != MISSING)
                       & (G[:, PATERNAL] != G[:, MATERNAL])))
                if unphased_hets[0].size:
                    if self.debug:
                        if count >= num_hom_haps:
                            print 'After hom, items on queue %d' % (
                                q.qsize(), )
                            pass
                        print 'unphased_hets', unphased_hets
                        print 'si', si
                        print 'index i[unphased_hets]', np.where(
                            i)[0][unphased_hets]
                        print 'H_unphased', H[i][unphased_hets]
                        print 'G_unphased', G[unphased_hets]
                    H_unphased = H[i][unphased_hets]
                    H_phased = H_unphased.copy()
                    complete_haplotype_partial(H_phased, G[unphased_hets])
                    #                    if self.debug:
                    #                        print 'Phasing hets'
                    #                        print 'i', np.where(i)
                    #                        print 'H_unphased', H_unphased
                    #                        print 'G of unphased_hets', G[unphased_hets]
                    newly_phased_alleles = np.where(H_phased != H_unphased)[1]
                    self.h[si[unphased_hets]] = H_phased[:]
                    #                    if self.debug:
                    #                        print 'After phasing H_unphased', self.h[s[unphased_hets]]
                    #                        print 'unphased_hets', s[unphased_hets]
                    #                        print 'newly_phased_alleles', newly_phased_alleles
                    # Append the new data we can now make use of to the queue
                    if self.debug:
                        print 'After phasing them'
                        print 'H_phased  ', H_phased
                    for hap in zip(si[unphased_hets], newly_phased_alleles):
                        if self.debug:
                            print 'Adding phased het haplotypes to queue', hap
                        q.put(hap)
        self.__override_training_imputed_by_genotypes()
        if self.remove_partial_calls: self.__remove_partial_calls()
        if self.debug_sample >= 0:
            j = self.debug_sample
            if j in self.training_index:
                print 'Final g[%d] = %s, h[%d] = %s' % (
                    self.training_index[j], repr(
                        self.g[self.training_index[j]]), j, repr(self.h[j]))
            else:
                print 'h[%d] = %s' % (j, repr(self.h[j]))
Example #4
0
File: qc.py Project: orenlivne/ober
class _IbdQc(object):
    '''Calculates a QC measure for a single variant using IBD cliques.'''
    
    #---------------------------------------------
    # Constants
    #---------------------------------------------
    __EMPTY_ARRAY = np.array([])
    
    #---------------------------------------------
    # Constructors
    #---------------------------------------------
    def __init__(self, h, hap_type, ibd, g, training_sample_index, chrom, snp_bp, debug=False, majority_threshold=0.66,
                 debug_sample= -1, max_iter=1000):
        '''Initialize an imputer that changes the result h in-place using the IBD index ibd
        and training genotype data g at snp position snp_bp. majority vote = threshold for majority vote. When
        |# haps with majority allele| >= majority_threshold*(All haps) in a clique, the vote is accepted.''' 

        # Input fields
        self.h, self.hap_type, self.g, self.ibd, self.training_sample_index, self.max_iter, self.debug, \
        self.debug_sample, self.chrom = h, hap_type, g, ibd, training_sample_index, max_iter, debug, \
        debug_sample, chrom

        # Maps sample ID to training set index
        self.training_index = dict(zip(self.training_sample_index, xrange(self.g.shape[0])))
        self.ratio_threshold = majority_threshold / (1 - majority_threshold)

        # Find the appropriate IBD index SNP for the target base-pair position
        self.snp = ibd.nearest_left_snp(chrom, snp_bp)
        if self.debug:
            ibd.find(self.chrom, self.snp, self.training_sample_index[0], PATERNAL)
            print 'IBD index file %s/chr%d/region-%d.npz, nearest SNP %d' % (ibd._index_dir, ibd._chrom, ibd._start, self.snp)
        
    #---------------------------------------------
    # Methods
    #---------------------------------------------                 
    def impute(self, samples=None):
        '''Infer imputed genotypes at all samples of h from the samples of g.'''
        # Aliases
        r = self.ratio_threshold
        if self.debug_sample >= 0:
            j = self.debug_sample
            if j in self.training_index: print 'Initial g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j]))
            else: print 'Initial h[%d] = %s' % (j, repr(self.h[j]))
        
        # Create a queue of haplotypes that can be used to impute others. Initially, it
        # is the training set homozygotes; every time we phase a training set het, its other
        # allele is appended to the queue.
        #
        # TODO: possibly replace by a priority queue where alleles are ordered by their clique sizes?
        # (we have extra confidence in those alleles; not sure it matters though) 
        q = Queue()

        # Initial condition: phase all hom training samples         
        hom = self.__phase_hom()
        for hap in itertools.product(hom, ALLELES):
            if self.debug: print 'Adding hom haplotype to queue', hap
            q.put(hap)
        num_hom_haps = q.qsize()
        if self.debug:
            print 'Items on queue : %d' % (q.qsize(),)
            print 'filled haps    : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,)
            HH = recode.recode_single_genotype(self.h)
            print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,)
            print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,)
        count = 0
        while not q.empty():
            # Find group = the IBD clique of an imputed haplotype (all haps that are IBD with it at self.snp)
            hap = q.get()
            count += 1
            if count > self.max_iter: raise ValueError('Possible imputation bug - exceeded maximum number of iterations')
            if self.debug:
                print '*' * 55
                print 'Iteration %d, imputing from hap %s' % (count, hap)
                print '*' * 55
                if self.debug_sample >= 0:
                    j = self.debug_sample
                    print 'h[%d] = %s' % (j, repr(self.h[j]))
                    if self.h[j, 0] == 1:
                        pass
            group = self.ibd.find(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE])
            if group.size:
                # if self.debug: print 'group', group
                s, a = group[:, SAMPLE], group[:, ALLELE]
                H = self.h[s, a]
#                    print 'H', H
                # Find haplotypes that have been imputed as allele 1 and those imputed as allele 2
                R1, R2 = group[np.where(H == 1)], group[np.where(H == 2)]
                if self.debug:
                    print 'IBD group %d (%d haps):\n%s' % (self.ibd.group_index(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]), len(group), repr(np.concatenate((group, H[np.newaxis].transpose()), axis=1)))
                    print 'R1 = %s' % (repr(list(map(tuple, R1))),)
                    print 'R2 = %s' % (repr(list(map(tuple, R2))),)
#                    print 'R1:\n%s' % (repr(np.concatenate((R1, self.h[R1[:, 0], R1[:, 1]][np.newaxis].transpose()), axis=1)))
#                    print 'R2:\n%s' % (repr(np.concatenate((R2, self.h[R2[:, 0], R2[:, 1]][np.newaxis].transpose()), axis=1)))
                # Majority vote: if there are enough haps with one allele, override the rest.
                # Otherwise, an unresolved conflict ==> zero everyone out.
                l1, l2 = len(R1), len(R2)
                consensus = 1 if l1 >= r * l2 else (2 if l2 >= r * l1 else MISSING)
                self.h[s, a] = consensus
                if consensus == 0:
                    # If no consensus is reached, keep the already-imputed values in place, otherwise
                    # we can run into an infinite loop by imputing and erasing h-entries.  
                    self.h[R1[:, 0], R1[:, 1]] = 1
                    self.h[R2[:, 0], R2[:, 1]] = 2
                H = self.h[s]
                if self.debug:
                    print 'l1 %d l2 %d consensus %d' % (l1, l2, consensus)
                    print 'Items on queue : %d' % (q.qsize(),)
                    print 'filled haps    : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,)
                    HH = recode.recode_single_genotype(self.h)
                    print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,)
                    print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,)
                
                # Phase training hets (this includes BOTH partially-called = potential hets an
                # fully-called hets) with one imputed allele
                i = np.array([self.training_index.has_key(x) for x in s])
                si = s[i]
                G = self.g[map(self.training_index.get, si), :]
                unphased_hets = np.where(((H[i, PATERNAL] != MISSING) ^ (H[i, MATERNAL] != MISSING)) 
                                         & ((G[:, PATERNAL] != MISSING) | (G[:, MATERNAL] != MISSING)
                                            & (G[:, PATERNAL] != G[:, MATERNAL])))
                if unphased_hets[0].size:
                    if self.debug:
                        if count >= num_hom_haps:
                            print 'After hom, items on queue %d' % (q.qsize(),)
                            pass
                        print 'unphased_hets', unphased_hets
                        print 'si', si
                        print 'index i[unphased_hets]', np.where(i)[0][unphased_hets]
                        print 'H_unphased', H[i][unphased_hets]
                        print 'G_unphased', G[unphased_hets]
                    H_unphased = H[i][unphased_hets]
                    H_phased = H_unphased.copy()
                    complete_haplotype_partial(H_phased, G[unphased_hets])
#                    if self.debug:
#                        print 'Phasing hets'
#                        print 'i', np.where(i) 
#                        print 'H_unphased', H_unphased
#                        print 'G of unphased_hets', G[unphased_hets]
                    newly_phased_alleles = np.where(H_phased != H_unphased)[1]
                    self.h[si[unphased_hets]] = H_phased[:]
#                    if self.debug:
#                        print 'After phasing H_unphased', self.h[s[unphased_hets]]
#                        print 'unphased_hets', s[unphased_hets]
#                        print 'newly_phased_alleles', newly_phased_alleles
                    # Append the new data we can now make use of to the queue
                    if self.debug:
                        print 'After phasing them'
                        print 'H_phased  ', H_phased
                    for hap in zip(si[unphased_hets], newly_phased_alleles):
                        if self.debug:
                            print 'Adding phased het haplotypes to queue', hap
                        q.put(hap)
        self.__override_training_imputed_by_genotypes()
        if self.debug_sample >= 0:
            j = self.debug_sample
            if j in self.training_index: print 'Final g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j]))
            else: print 'h[%d] = %s' % (j, repr(self.h[j]))
Example #5
0
    def impute(self, samples=None):
        '''Infer imputed genotypes at all samples of h from the samples of g.'''
        # Aliases
        r = self.ratio_threshold
        if self.debug_sample >= 0:
            j = self.debug_sample
            if j in self.training_index:
                print 'Initial g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j]))
            else:
                print 'Initial h[%d] = %s' % (j, repr(self.h[j]))
        
        # Create a queue of haplotypes that can be used to impute others. Initially, it
        # is the training set homozygotes; every time we phase a training set het, its other
        # allele is appended to the queue.
        q = Queue()

        # Initial condition: phase all hom training samples         
        hom = self.__phase_hom()
        for hap in itertools.product(hom, ALLELES):
            if self.debug:
                print 'Adding hom haplotype to queue', hap
            q.put(hap)
        num_hom_haps = q.qsize()
        if self.debug:
            print 'Items on queue : %d' % (q.qsize(),)
            print 'filled haps    : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,)
            HH = recode.recode_single_genotype(self.h)
            print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,)
            print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,)
        count = 0
        while not q.empty():
            # Get the IBD group of an imputed haplotype (all haps that are IBD with it at self.snp)
            hap = q.get()
            count += 1
            if count > self.max_iter:
                raise ValueError('Possible imputation bug - exceeded max number of iterations')
            if self.debug:
                print '*' * 55
                print 'Iteration %d, imputing from hap %s' % (count, hap)
                print '*' * 55
                if self.debug_sample >= 0:
                    j = self.debug_sample
                    print 'h[%d] = %s' % (j, repr(self.h[j]))
                    if self.h[j, 0] == 1:
                        pass
            group = self.ibd.find(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE])
            if group.size:
                # if self.debug: print 'group', group
                s, a = group[:, SAMPLE], group[:, ALLELE]
                H = self.h[s, a]
#                    print 'H', H
                # Find haplotypes that have been imputed with each of the alleles
                R1, R2 = group[np.where(H == 1)], group[np.where(H == 2)]
                if self.debug:
                    print 'IBD group %d (%d haps):\n%s' % (self.ibd.group_index(self.chrom, self.snp, hap[SAMPLE], hap[ALLELE]), len(group), repr(np.concatenate((group, H[np.newaxis].transpose()), axis=1)))
                    print 'R1 = %s' % (repr(list(map(tuple, R1))),)
                    print 'R2 = %s' % (repr(list(map(tuple, R2))),)
#                    print 'R1:\n%s' % (repr(np.concatenate((R1, self.h[R1[:, 0], R1[:, 1]][np.newaxis].transpose()), axis=1)))
#                    print 'R2:\n%s' % (repr(np.concatenate((R2, self.h[R2[:, 0], R2[:, 1]][np.newaxis].transpose()), axis=1)))
                # Majority vote: if there are enough haps with one allele, override the rest.
                # Otherwise, an unresolved conflict ==> zero everyone out.
                l1, l2 = len(R1), len(R2)
                consensus = 1 if l1 >= r * l2 else (2 if l2 >= r * l1 else MISSING)
                self.h[s, a] = consensus
                if consensus == 0:
                    # If no consensus is reached, keep the already-imputed values in place, otherwise
                    # we can run into an infinite loop by imputing and erasing h-entries.  
                    self.h[R1[:, 0], R1[:, 1]] = 1
                    self.h[R2[:, 0], R2[:, 1]] = 2
                H = self.h[s]
                if self.debug:
                    print 'l1 %d l2 %d consensus %d' % (l1, l2, consensus)
                    print 'Items on queue : %d' % (q.qsize(),)
                    print 'filled haps    : %.2f%%' % ((100.*len(self.h.nonzero()[0])) / self.h.size,)
                    HH = recode.recode_single_genotype(self.h)
                    print 'filled samples : %.2f%%' % ((100.*len(np.where(HH > 0)[0])) / HH.size,)
                    print 'phased training: %.2f%%' % ((100.*len(self.h[self.training_sample_index].nonzero()[0])) / self.h[self.training_sample_index].size,)
                
                # Phase training hets (this includes BOTH partially-called = potential hets an
                # fully-called hets) with one imputed allele
                i = np.array([self.training_index.has_key(x) for x in s])
                si = s[i]
                G = self.g[map(self.training_index.get, si), :]
                unphased_hets = np.where(((H[i, PATERNAL] != MISSING) ^ (H[i, MATERNAL] != MISSING)) 
                                         & ((G[:, PATERNAL] != MISSING) | (G[:, MATERNAL] != MISSING)
                                            & (G[:, PATERNAL] != G[:, MATERNAL])))
                if unphased_hets[0].size:
                    if self.debug:
                        if count >= num_hom_haps:
                            print 'After hom, items on queue %d' % (q.qsize(),)
                            pass
                        print 'unphased_hets', unphased_hets
                        print 'si', si
                        print 'index i[unphased_hets]', np.where(i)[0][unphased_hets]
                        print 'H_unphased', H[i][unphased_hets]
                        print 'G_unphased', G[unphased_hets]
                    H_unphased = H[i][unphased_hets]
                    H_phased = H_unphased.copy()
                    complete_haplotype_partial(H_phased, G[unphased_hets])
#                    if self.debug:
#                        print 'Phasing hets'
#                        print 'i', np.where(i) 
#                        print 'H_unphased', H_unphased
#                        print 'G of unphased_hets', G[unphased_hets]
                    newly_phased_alleles = np.where(H_phased != H_unphased)[1]
                    self.h[si[unphased_hets]] = H_phased[:]
#                    if self.debug:
#                        print 'After phasing H_unphased', self.h[s[unphased_hets]]
#                        print 'unphased_hets', s[unphased_hets]
#                        print 'newly_phased_alleles', newly_phased_alleles
                    # Append the new data we can now make use of to the queue
                    if self.debug:
                        print 'After phasing them'
                        print 'H_phased  ', H_phased
                    for hap in zip(si[unphased_hets], newly_phased_alleles):
                        if self.debug:
                            print 'Adding phased het haplotypes to queue', hap
                        q.put(hap)
        self.__override_training_imputed_by_genotypes()
        if self.remove_partial_calls: self.__remove_partial_calls()
        if self.debug_sample >= 0:
            j = self.debug_sample
            if j in self.training_index:
                print 'Final g[%d] = %s, h[%d] = %s' % (self.training_index[j], repr(self.g[self.training_index[j]]), j, repr(self.h[j]))
            else:
                print 'h[%d] = %s' % (j, repr(self.h[j]))