Beispiel #1
0
 def test_first_occurrence_index_int8(self):
     """Test finding the index of the first occurrence of a value in an int8 array.

     Each call passes (array, value, start index, step); a step of 1 scans
     forward from the start index and a step of -1 scans backward. A value
     that is absent from the array is expected to yield -1.
     """
     a = np.array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0]).astype("int8")
     # Scans starting at either end of the array.
     assert_equal(first_occurrence_index_byte(a, 1, 0, 1), 3, "Wrong index finding, forward")
     assert_equal(first_occurrence_index_byte(a, 1, len(a) - 1, -1), 7, "Wrong index finding, backward")
     # Scans starting between the two occurrences (index 5).
     assert_equal(first_occurrence_index_byte(a, 1, 5, -1), 3, "Wrong index finding, from middle backward")
     assert_equal(first_occurrence_index_byte(a, 1, 5, 1), 7, "Wrong index finding, from middle forward")
     # This search starts at index 0, not in the middle, so the failure message says so.
     assert_equal(
         first_occurrence_index_byte(a, 10, 0, 1), -1, "Wrong index finding, forward, value not in array"
     )
Beispiel #2
0
 def _ibs_segment_len(self, d, k):
     '''Find the segment around index k that is bounded by the nearest entries of d equal to 1.

     d is scanned for the value 1 backward starting at k - 1 and forward
     starting at k. When no 1 is found on the left, the boundary falls back to
     index 0; when none is found on the right, it falls back to the last index of d.

     Returns the tuple (left, right, L), where left and right are the boundary
     indices and L = self.bp[right] - self.bp[left].'''
     # Left boundary: nearest 1 before k, or the start of the array.
     left = first_occurrence_index_byte(d, 1, k - 1, -1)
     if left < 0:
         left = 0
     # Right boundary: nearest 1 at or after k, or the end of the array.
     right = first_occurrence_index_byte(d, 1, k, 1)
     if right < 0:
         right = len(d) - 1
     span = self.bp[right] - self.bp[left]
     return left, right, span
Beispiel #3
0
 def _ibs_segment_len(self, d, k):
     '''Find the segment around index k that is bounded by the nearest entries of d equal to 1.

     d is scanned for the value 1 backward starting at k - 1 and forward
     starting at k. When no 1 is found on the left, the boundary falls back to
     index 0; when none is found on the right, it falls back to the last index of d.

     Returns the tuple (left, right, L), where left and right are the boundary
     indices and L = self.bp[right] - self.bp[left].'''
     # Left boundary: nearest 1 before k, or the start of the array.
     left = first_occurrence_index_byte(d, 1, k - 1, -1)
     if left < 0:
         left = 0
     # Right boundary: nearest 1 at or after k, or the end of the array.
     right = first_occurrence_index_byte(d, 1, k, 1)
     if right < 0:
         right = len(d) - 1
     span = self.bp[right] - self.bp[left]
     return left, right, span
Beispiel #4
0
def _largest_frame(problem, data_exist, debug=False):
    '''Find the largest intersection of an independent SNP frame and the data_exist array.

    Every frame in problem.frames (for the chromosome of the first SNP) is
    intersected with the indices where data_exist is true. The largest
    intersection is selected and the first and last indices at which
    data_exist is true are added to it.

    Returns a sorted array of unique SNP indices. If debug is true, the frame
    lengths and the selected frame are printed.
    '''
    where_data_exist = np.where(data_exist)[0]
    # Fetch frames of the chromosome in question. Assuming all SNPs are on the same chromosome.
    frames = problem.frames[problem.info.snp['chrom'][0]]
    pair_frames = [np.intersect1d(frame, where_data_exist) for frame in frames]
    frame_number = np.argmax(np.array([len(x) for x in pair_frames]))
    selected = pair_frames[frame_number]
    # Add the first and last SNP in the domain that have data for full frame coverage.
    start = first_occurrence_index_byte(data_exist, True, 0, 1)
    stop = first_occurrence_index_byte(data_exist, True, len(data_exist) - 1, -1)
    # np.union1d already returns a sorted array of unique values, so no extra sort is needed.
    frame = np.union1d([start, stop], selected)
    if debug:
        # Report the size of the selected intersection. The previous code took len() of the
        # list-comprehension loop variable, i.e. of whichever frame happened to be last.
        frame_size = len(selected)
        # Single-argument print(...) behaves the same as a statement and as a function.
        print('Frame lengths %s' % ([len(x) for x in frames],))
        print('Frame # %s , size %s start %s stop %s %s'
              % (frame_number, frame_size, start, stop, frame.tolist()))
    return frame
Beispiel #5
0
def _largest_frame(problem, data_exist, debug=False):
    '''Find the largest intersection of an independent SNP frame and the data_exist array.

    Every frame in problem.frames (for the chromosome of the first SNP) is
    intersected with the indices where data_exist is true. The largest
    intersection is selected and the first and last indices at which
    data_exist is true are added to it.

    Returns a sorted array of unique SNP indices. If debug is true, the frame
    lengths and the selected frame are printed.
    '''
    where_data_exist = np.where(data_exist)[0]
    # Fetch frames of the chromosome in question. Assuming all SNPs are on the same chromosome.
    frames = problem.frames[problem.info.snp['chrom'][0]]
    pair_frames = [np.intersect1d(frame, where_data_exist) for frame in frames]
    frame_number = np.argmax(np.array([len(x) for x in pair_frames]))
    selected = pair_frames[frame_number]
    # Add the first and last SNP in the domain that have data for full frame coverage.
    start = first_occurrence_index_byte(data_exist, True, 0, 1)
    stop = first_occurrence_index_byte(data_exist, True, len(data_exist) - 1, -1)
    # np.union1d already returns a sorted array of unique values, so no extra sort is needed.
    frame = np.union1d([start, stop], selected)
    if debug:
        # Report the size of the selected intersection. The previous code took len() of the
        # list-comprehension loop variable, i.e. of whichever frame happened to be last.
        frame_size = len(selected)
        # Single-argument print(...) behaves the same as a statement and as a function.
        print('Frame lengths %s' % ([len(x) for x in frames],))
        print('Frame # %s , size %s start %s stop %s %s'
              % (frame_number, frame_size, start, stop, frame.tolist()))
    return frame
Beispiel #6
0
    def __segments(self, j, is_i_phased, use_kinship=True):
        '''Return the IBD segments between sample i (self.i) and a phased, POO-determined relative j.

        Supports minimum required IBD segment length vs. d meioses separating two samples. Assumes
        an exponential distribution and computes the value x for which P(x >= X) = ibd_length_upper_percentile.

        Important: use a short median filter window size (3-5) to catch more spikes since we are
        less certain that spikes are genotype errors than in parent-child IBD calculations

        If i is phased, will try to match i's haplotypes against j's. Otherwise, will match
        one of i's haps against j's genotype (assuming both i's haps are equal, and phased only at hom SNPs).

        Parameters:
        j - index of the relative sample.
        is_i_phased - currently ignored; the flag is recomputed from self.het_fill_fraction (see note below).
        use_kinship - if True and i is unphased, choose i's haplotype for each segment from the kinship
        between the parents of i and j; if False, always use PATERNAL (debugging only).

        Returns a SegmentSet holding the segments that pass the minimum-length filter
        (self.params.min_len, measured in self.params.len_unit).
        '''
        g, h = self.problem.data
        # NOTE(review): this overrides the is_i_phased argument, so the caller's value is never
        # used. Kept as in the original; confirm whether the parameter should be honored instead.
        is_i_phased = (self.het_fill_fraction > self.params.het_fill_threshold)

        # Kinships between i's parents and j's parents for parent-of-origin delineation in the produced
        # segments (we need to know which of i's haplotypes is IBD with the j haplotype that fits an IBD=1
        # segment)
        sample_id = self.problem.pedigree.sample_id
        if use_kinship:
            predecessors = self.problem.pedigree.graph.predecessors
            i_parents_ids = [sample_id[predecessors(self.i)[a]] for a in ALLELES]
            j_parents_ids = [sample_id[predecessors(j)[a]] for a in ALLELES]
            # Rows of K follow j's parents, columns follow i's parents.
            K = np.array([[self.params.kinship(x, y) for y in i_parents_ids] for x in j_parents_ids])
        if self.params.debug:
            # Single-argument print(...) behaves the same as a statement and as a function.
            print('-' * 70)
            print('IBD Segments between i and a surrogate parent j')
            print('i=%d ID %8d fill %.3f%% Phased? %s' %
                  (self.i, sample_id[self.i], 100. * self.problem.fill_fraction_of_sample(self.i),
                   "yes" if is_i_phased else "no"))
            print('j=%d ID %8d fill %.3f%%' % (j, sample_id[j], 100. * self.problem.fill_fraction_of_sample(j)))
            u, d = im.pt.lowest_common_ancestor(self.problem.pedigree.graph, self.i, j)
            if u:
                print('Lowest-common ancestor: %d, depth=%d' % (u, d))
            print('-' * 70)
            print('Frame lengths %s' % ([len(x) for x in self.frames],))

        #--------------------------------------------
        # Pick a frame of independent SNPs
        #--------------------------------------------
        # SNPs with full Gi, Gj data
        mask_gi_gj_exist = self.gi_exists & im.recode.filled_genotype(g[:, j])
        gi_gj_exist = np.where(mask_gi_gj_exist)[0]
        # Find Largest-intersecting independent SNP frame
        pair_frames = [np.intersect1d(frame, gi_gj_exist) for frame in self.frames]
        frame_number = np.argmax(np.array([len(x) for x in pair_frames]))
        selected = pair_frames[frame_number]
        # Add the first and last SNP in the domain that have i,j data for full frame coverage.
        start = first_occurrence_index_byte(mask_gi_gj_exist, True, 0, 1)
        # The backward scan must begin at the last index of the mask. The number of SNPs with data
        # (len(gi_gj_exist)) is generally smaller and would miss the true last SNP with data.
        stop = first_occurrence_index_byte(mask_gi_gj_exist, True, len(mask_gi_gj_exist) - 1, -1)
        # np.union1d already returns a sorted array of unique values.
        frame = np.union1d([start, stop], selected)
        if self.params.debug:
            # Size of the selected intersection (previously len() of a leaked comprehension variable).
            frame_size = len(selected)
            print('Frame # %s , size %s start %s stop %s %s'
                  % (frame_number, frame_size, start, stop, frame.tolist()))

        #--------------------------------------------
        # Calculate IBD posterior and IBD mask
        #--------------------------------------------
        prob_ibd = self.prob_ibd_calculator(self.problem, self.i, j, frame, self.params)
        is_ibd = prob_ibd > 0.0
        if self.params.debug:
            raw_segments = [(frame[x[START]], frame[x[STOP]] if x[STOP] < len(frame) else self.num_snps)
                            for x in ibd.segment.segments_with_value(is_ibd, True, 3)]
            print('Raw IBD segments %s' % (raw_segments,))

        #--------------------------------------------
        # Calculate Haplotype matching mask
        #--------------------------------------------
        # Interpolate prob_ibd from frame SNPs to all SNPs (0 outside IBD segments; linear interpolation
        # within segments). Interpolate mask_is_ibd (piecewise constant: 1 within segments, 0 outside).
        # The builtin float/bool are what the np.float/np.bool aliases referred to.
        mask_prob_ibd = np.zeros((self.num_snps,), dtype=float)
        mask_is_ibd = np.zeros((self.num_snps,), dtype=bool)
        for fr in ibd.segment.segments_with_value(is_ibd, True):
            full_segment = self.__frame_segment_to_snp(frame, fr)
            start, stop = full_segment[START], full_segment[STOP]  # Original indices of segment boundaries
            fr_start, fr_stop = fr[START], fr[STOP]  # Frame indices of segment boundaries
            mask_prob_ibd[start:stop] = np.interp(self.cm[start:stop], self.cm[frame[fr_start:fr_stop]],
                                                  prob_ibd[fr_start:fr_stop])
            mask_is_ibd[start:stop] = True

        # i is phased ==> calculate difference mask between each i-hap and j-hap
        # i is not phased ==> calculate difference between i's genotype and each j-hap. Output a dummy i-hap in segments
        if is_i_phased:
            hap_diff = im.diff.all_diffs(h, self.i, j)
        else:
            hap_diff = np.array([im.recode.ibs_diff_gh(g[:, self.i], h[:, j, j_allele]) for j_allele in ALLELES])
        mask_is_hap_fit = (hap_diff <= 0)
        mask = mask_is_ibd & mask_is_hap_fit

        # If use_kinship=True and i is unphased, find which of i's parents has higher kinship to the
        # j_allele-parent of j. Assign the segment to be between the corresponding i and j haplotypes.
        #
        # If use_kinship=False, assign the IBD segment to an arbitrary haplotype (PATERNAL). Inferior
        # and should only be used during debugging.
        #
        # The pairs are built explicitly so that the kinship lookup is evaluated once per j_allele.
        # The former single comprehension read j_allele before its own loop bound it, and therefore
        # used whatever value an earlier comprehension had left behind.
        # mask[k] is assumed to be ordered like allele_pairs - TODO confirm against im.diff.all_diffs.
        if is_i_phased:
            allele_pairs = [(i_allele, j_allele) for i_allele in ALLELES for j_allele in ALLELES]
        elif use_kinship:
            # NOTE(review): K[:, j_allele] selects a column (an i-parent) and takes the argmax over
            # j's parents, while the comment above describes the argmax over i's parents
            # (K[j_allele, :]). Indexing kept as in the original; confirm which one is intended.
            allele_pairs = [(np.argmax(K[:, j_allele]), j_allele) for j_allele in ALLELES]
        else:
            allele_pairs = [(PATERNAL, j_allele) for j_allele in ALLELES]

        # Filter segments based on length
        ibd_segments = SegmentSet([])
        min_len = self.params.min_len
        len_unit = self.params.len_unit
        for k, (i_allele, j_allele) in enumerate(allele_pairs):
            mask_pair = mask[k]
            filtered_mask = ibd.filter_diff(mask_pair, 'median', 5)
            # SNPs whose value was changed by the median filter are reported as errors.
            error = (filtered_mask != mask_pair)
            # Output only segments of 1's of at least a trivially-small length
            segments = ibd.segment.segments_with_value(filtered_mask, True, 3)
            if self.params.debug:
                print('--- IBD segments between (%d,%d), (%d,%d)' % (self.i, i_allele, j, j_allele))
            long_segments = []
            for s in segments:
                bp_start = self.bp[s[START]]
                bp_stop = im.segment.stop_bp(self.bp, s[STOP], self.num_snps)
                cm_start = self.cm[s[START]]
                cm_stop = im.segment.stop_bp(self.cm, s[STOP], self.num_snps)
                # min_len is compared against the cM span when len_unit is 'cm', otherwise it is
                # scaled by MEGA_BASE_PAIR and compared against the base-pair span.
                if len_unit == 'cm':
                    is_long = (cm_stop - cm_start >= min_len)
                else:
                    is_long = (bp_stop - bp_start >= im.constants.MEGA_BASE_PAIR * min_len)
                if is_long:
                    long_segments.append(Segment(s, [(self.i, i_allele), (j, j_allele)], (bp_start, bp_stop),
                                                 error_snps=np.where(ibd.segment.is_in_segment(error, s)),
                                                 confidence=mask_prob_ibd[s[START]:s[STOP]],
                                                 cm=(cm_start, cm_stop), collapse_to_set=False))
            ibd_segments += long_segments
            if self.params.debug:
                print('Long segments (>= %.2f %s):' % (min_len, len_unit))
                print('\n'.join('\t' + repr(segment) for segment in long_segments) if long_segments else '\t-')
                print('')
        return ibd_segments
Beispiel #7
0
    def __segments(self, j, is_i_phased, use_kinship=True):
        '''Return the IBD segments between sample i (self.i) and a phased, POO-determined relative j.

        Supports minimum required IBD segment length vs. d meioses separating two samples. Assumes
        an exponential distribution and computes the value x for which P(x >= X) = ibd_length_upper_percentile.

        Important: use a short median filter window size (3-5) to catch more spikes since we are
        less certain that spikes are genotype errors than in parent-child IBD calculations

        If i is phased, will try to match i's haplotypes against j's. Otherwise, will match
        one of i's haps against j's genotype (assuming both i's haps are equal, and phased only at hom SNPs).

        Parameters:
        j - index of the relative sample.
        is_i_phased - currently ignored; the flag is recomputed from self.het_fill_fraction (see note below).
        use_kinship - if True and i is unphased, choose i's haplotype for each segment from the kinship
        between the parents of i and j; if False, always use PATERNAL (debugging only).

        Returns a SegmentSet holding the segments that pass the minimum-length filter
        (self.params.min_len, measured in self.params.len_unit).
        '''
        g, h = self.problem.data
        # NOTE(review): this overrides the is_i_phased argument, so the caller's value is never
        # used. Kept as in the original; confirm whether the parameter should be honored instead.
        is_i_phased = (self.het_fill_fraction > self.params.het_fill_threshold)

        # Kinships between i's parents and j's parents for parent-of-origin delineation in the produced
        # segments (we need to know which of i's haplotypes is IBD with the j haplotype that fits an IBD=1
        # segment)
        sample_id = self.problem.pedigree.sample_id
        if use_kinship:
            predecessors = self.problem.pedigree.graph.predecessors
            i_parents_ids = [sample_id[predecessors(self.i)[a]] for a in ALLELES]
            j_parents_ids = [sample_id[predecessors(j)[a]] for a in ALLELES]
            # Rows of K follow j's parents, columns follow i's parents.
            K = np.array([[self.params.kinship(x, y) for y in i_parents_ids] for x in j_parents_ids])
        if self.params.debug:
            # Single-argument print(...) behaves the same as a statement and as a function.
            print('-' * 70)
            print('IBD Segments between i and a surrogate parent j')
            print('i=%d ID %8d fill %.3f%% Phased? %s' %
                  (self.i, sample_id[self.i], 100. * self.problem.fill_fraction_of_sample(self.i),
                   "yes" if is_i_phased else "no"))
            print('j=%d ID %8d fill %.3f%%' % (j, sample_id[j], 100. * self.problem.fill_fraction_of_sample(j)))
            u, d = im.pt.lowest_common_ancestor(self.problem.pedigree.graph, self.i, j)
            if u:
                print('Lowest-common ancestor: %d, depth=%d' % (u, d))
            print('-' * 70)
            print('Frame lengths %s' % ([len(x) for x in self.frames],))

        #--------------------------------------------
        # Pick a frame of independent SNPs
        #--------------------------------------------
        # SNPs with full Gi, Gj data
        mask_gi_gj_exist = self.gi_exists & im.recode.filled_genotype(g[:, j])
        gi_gj_exist = np.where(mask_gi_gj_exist)[0]
        # Find Largest-intersecting independent SNP frame
        pair_frames = [np.intersect1d(frame, gi_gj_exist) for frame in self.frames]
        frame_number = np.argmax(np.array([len(x) for x in pair_frames]))
        selected = pair_frames[frame_number]
        # Add the first and last SNP in the domain that have i,j data for full frame coverage.
        start = first_occurrence_index_byte(mask_gi_gj_exist, True, 0, 1)
        # The backward scan must begin at the last index of the mask. The number of SNPs with data
        # (len(gi_gj_exist)) is generally smaller and would miss the true last SNP with data.
        stop = first_occurrence_index_byte(mask_gi_gj_exist, True, len(mask_gi_gj_exist) - 1, -1)
        # np.union1d already returns a sorted array of unique values.
        frame = np.union1d([start, stop], selected)
        if self.params.debug:
            # Size of the selected intersection (previously len() of a leaked comprehension variable).
            frame_size = len(selected)
            print('Frame # %s , size %s start %s stop %s %s'
                  % (frame_number, frame_size, start, stop, frame.tolist()))

        #--------------------------------------------
        # Calculate IBD posterior and IBD mask
        #--------------------------------------------
        prob_ibd = self.prob_ibd_calculator(self.problem, self.i, j, frame, self.params)
        is_ibd = prob_ibd > 0.0
        if self.params.debug:
            raw_segments = [(frame[x[START]], frame[x[STOP]] if x[STOP] < len(frame) else self.num_snps)
                            for x in ibd.segment.segments_with_value(is_ibd, True, 3)]
            print('Raw IBD segments %s' % (raw_segments,))

        #--------------------------------------------
        # Calculate Haplotype matching mask
        #--------------------------------------------
        # Interpolate prob_ibd from frame SNPs to all SNPs (0 outside IBD segments; linear interpolation
        # within segments). Interpolate mask_is_ibd (piecewise constant: 1 within segments, 0 outside).
        # The builtin float/bool are what the np.float/np.bool aliases referred to.
        mask_prob_ibd = np.zeros((self.num_snps,), dtype=float)
        mask_is_ibd = np.zeros((self.num_snps,), dtype=bool)
        for fr in ibd.segment.segments_with_value(is_ibd, True):
            full_segment = self.__frame_segment_to_snp(frame, fr)
            start, stop = full_segment[START], full_segment[STOP]  # Original indices of segment boundaries
            fr_start, fr_stop = fr[START], fr[STOP]  # Frame indices of segment boundaries
            mask_prob_ibd[start:stop] = np.interp(self.cm[start:stop], self.cm[frame[fr_start:fr_stop]],
                                                  prob_ibd[fr_start:fr_stop])
            mask_is_ibd[start:stop] = True

        # i is phased ==> calculate difference mask between each i-hap and j-hap
        # i is not phased ==> calculate difference between i's genotype and each j-hap. Output a dummy i-hap in segments
        if is_i_phased:
            hap_diff = im.diff.all_diffs(h, self.i, j)
        else:
            hap_diff = np.array([im.recode.ibs_diff_gh(g[:, self.i], h[:, j, j_allele]) for j_allele in ALLELES])
        mask_is_hap_fit = (hap_diff <= 0)
        mask = mask_is_ibd & mask_is_hap_fit

        # If use_kinship=True and i is unphased, find which of i's parents has higher kinship to the
        # j_allele-parent of j. Assign the segment to be between the corresponding i and j haplotypes.
        #
        # If use_kinship=False, assign the IBD segment to an arbitrary haplotype (PATERNAL). Inferior
        # and should only be used during debugging.
        #
        # The pairs are built explicitly so that the kinship lookup is evaluated once per j_allele.
        # The former single comprehension read j_allele before its own loop bound it, and therefore
        # used whatever value an earlier comprehension had left behind.
        # mask[k] is assumed to be ordered like allele_pairs - TODO confirm against im.diff.all_diffs.
        if is_i_phased:
            allele_pairs = [(i_allele, j_allele) for i_allele in ALLELES for j_allele in ALLELES]
        elif use_kinship:
            # NOTE(review): K[:, j_allele] selects a column (an i-parent) and takes the argmax over
            # j's parents, while the comment above describes the argmax over i's parents
            # (K[j_allele, :]). Indexing kept as in the original; confirm which one is intended.
            allele_pairs = [(np.argmax(K[:, j_allele]), j_allele) for j_allele in ALLELES]
        else:
            allele_pairs = [(PATERNAL, j_allele) for j_allele in ALLELES]

        # Filter segments based on length
        ibd_segments = SegmentSet([])
        min_len = self.params.min_len
        len_unit = self.params.len_unit
        for k, (i_allele, j_allele) in enumerate(allele_pairs):
            mask_pair = mask[k]
            filtered_mask = ibd.filter_diff(mask_pair, 'median', 5)
            # SNPs whose value was changed by the median filter are reported as errors.
            error = (filtered_mask != mask_pair)
            # Output only segments of 1's of at least a trivially-small length
            segments = ibd.segment.segments_with_value(filtered_mask, True, 3)
            if self.params.debug:
                print('--- IBD segments between (%d,%d), (%d,%d)' % (self.i, i_allele, j, j_allele))
            long_segments = []
            for s in segments:
                bp_start = self.bp[s[START]]
                bp_stop = im.segment.stop_bp(self.bp, s[STOP], self.num_snps)
                cm_start = self.cm[s[START]]
                cm_stop = im.segment.stop_bp(self.cm, s[STOP], self.num_snps)
                # min_len is compared against the cM span when len_unit is 'cm', otherwise it is
                # scaled by MEGA_BASE_PAIR and compared against the base-pair span.
                if len_unit == 'cm':
                    is_long = (cm_stop - cm_start >= min_len)
                else:
                    is_long = (bp_stop - bp_start >= im.constants.MEGA_BASE_PAIR * min_len)
                if is_long:
                    long_segments.append(Segment(s, [(self.i, i_allele), (j, j_allele)], (bp_start, bp_stop),
                                                 error_snps=np.where(ibd.segment.is_in_segment(error, s)),
                                                 confidence=mask_prob_ibd[s[START]:s[STOP]],
                                                 cm=(cm_start, cm_stop), collapse_to_set=False))
            ibd_segments += long_segments
            if self.params.debug:
                print('Long segments (>= %.2f %s):' % (min_len, len_unit))
                print('\n'.join('\t' + repr(segment) for segment in long_segments) if long_segments else '\t-')
                print('')
        return ibd_segments