def phase_parent_by_template(self, info): '''Impute the parent haplotype. A random parent random phase is selected and switched at each recombination location in the template child.''' bp = self.problem.info.snp['base_pair'] num_snps = self.problem.num_snps parent = info.parent child = info.template_child parent_type = info.parent_type if self.params.debug: print 'phase_parent_by_template(parent=%d, template=%d, parent_type=%d)' % (parent, child, parent_type) # h = problem.haplotype.data # Infer the parent's haplotypes from the template child's recombinations r r = info.recombination_snp edge = [y for (x, y) in r if x == child] # r = np.concatenate(([-1], edge, [problem.num_snps-1])) phase = PATERNAL # Random phase set at the starting of the chromosome segments = segment.edges_to_segments(self.problem.snp_range, edge, phase, self.problem.num_snps, cover=True) segment_set = SegmentSet((Segment((x[0], x[1]), ((parent, x[2]), (child, parent_type)), (bp[x[0]], im.segment.stop_bp(bp, x[1], num_snps))) for x in segments)) # Only a single sweep suffices here, since only two samples are involved. If a homogeneous # entry is discovered and used to propagate phases in the second of the two IBD sets in # a SNP range, there's no need to fix the first one since the data is already fully filled. ibd.phase_by_ibd(self.request, [segment_set], 'max', num_sweeps=1)
def ibs_segments(self): '''Convert recombinations r to IBD segments. A generator.''' if not self.snps_exist: return r = self.recombination_index bp = self.problem.info.snp['base_pair'] # print r # Calculate IBD segments between all children and parent for child in self.children: # print '-------------------- child %d ----------------------' % (child,) edge = [y for (x, y) in r if x == child] phase = PATERNAL # Random phase set at the starting of the chromosome segments = segment.edges_to_segments(self.snps, edge, phase, self.num_snps, cover=True) if self.debug: print 'child', child, 'edge', edge, 'segments', segments # Flip initial phase if parent is phased at hap snps by comparing its # haplotypes with the child haplotypes on the longest IBD segment i = np.argmax(np.diff(segments)[:, 0]) segment_phase = phase if np.mod(i, 2) == 0 else 1 - phase actual_phase = self.__equal_parent_hap(child, segments[i, 0], segments[i, 1]) if segment_phase != actual_phase: segments = segment.flip_phase(segments) # Output segments in the standard IBD segment format yield SegmentSet((Segment((x[0], x[1]), ((self.parent, x[2]), (child, self.parent_type)), (bp[x[0]], im.segment.stop_bp(bp, x[1], self.num_snps))) for x in segments))
def phase_parent_by_template(self, info): '''Impute the parent haplotype. A random parent random phase is selected and switched at each recombination location in the template child.''' bp = self.problem.info.snp['base_pair'] num_snps = self.problem.num_snps parent = info.parent child = info.template_child parent_type = info.parent_type if self.params.debug: print 'phase_parent_by_template(parent=%d, template=%d, parent_type=%d)' % ( parent, child, parent_type) # h = problem.haplotype.data # Infer the parent's haplotypes from the template child's recombinations r r = info.recombination_snp edge = [y for (x, y) in r if x == child] # r = np.concatenate(([-1], edge, [problem.num_snps-1])) phase = PATERNAL # Random phase set at the starting of the chromosome segments = segment.edges_to_segments(self.problem.snp_range, edge, phase, self.problem.num_snps, cover=True) segment_set = SegmentSet( (Segment((x[0], x[1]), ((parent, x[2]), (child, parent_type)), (bp[x[0]], im.segment.stop_bp(bp, x[1], num_snps))) for x in segments)) # Only a single sweep suffices here, since only two samples are involved. If a homogeneous # entry is discovered and used to propagate phases in the second of the two IBD sets in # a SNP range, there's no need to fix the first one since the data is already fully filled. ibd.phase_by_ibd(self.request, [segment_set], 'max', num_sweeps=1)
def ibs_segments(self): '''Convert recombinations r to IBD segments. A generator.''' if not self.snps_exist: return r = self.recombination_index bp = self.problem.info.snp['base_pair'] # print r # Calculate IBD segments between all children and parent for child in self.children: # print '-------------------- child %d ----------------------' % (child,) edge = [y for (x, y) in r if x == child] phase = PATERNAL # Random phase set at the starting of the chromosome segments = segment.edges_to_segments(self.snps, edge, phase, self.num_snps, cover=True) if self.debug: print 'child', child, 'edge', edge, 'segments', segments # Flip initial phase if parent is phased at hap snps by comparing its # haplotypes with the child haplotypes on the longest IBD segment i = np.argmax(np.diff(segments)[:, 0]) segment_phase = phase if np.mod(i, 2) == 0 else 1 - phase actual_phase = self.__equal_parent_hap(child, segments[i, 0], segments[i, 1]) if segment_phase != actual_phase: segments = segment.flip_phase(segments) # Output segments in the standard IBD segment format yield SegmentSet((Segment( (x[0], x[1]), ((self.parent, x[2]), (child, self.parent_type)), (bp[x[0]], im.segment.stop_bp(bp, x[1], self.num_snps))) for x in segments))
def ibs_segments(haplotype, id1, id2, hap1_type, hap2_type, snps=None, include_alt_phase=False, error_filter='median', error_filter_length=5, length_bound=None, min_segment_length=INDETERMINATE, margin=0.0, debug=False): '''Return 1) Identical-by-State (IBS) segments separated by recombination events between two sample haplotypes (id1, hap1_type) and (id2, hap2_type). The 2-D output array's ith row format is (segment_start, segment_stop), (id1, hap1), (id2, hap2), (segment_start_bp, segment_stop_bp, segment_length_in_bp, num_errors_in_segment) The SNP range is [segment_start, segment_stop) where start=inclusive and stop is exclusive. 2) List of het_snp indices at which there are likely genotype errors. Options: snps - list of SNPs to base the comparison on. For parent-child comparisons, these should be heterozygous SNPs in the parent's genotype, distinguishing its haplotypes and used to locate segments. For unphased-phased individuals, these should be the list of homozygous SNPs at the unphased individual (those that have data). If not specified, all SNPs are used. length_bound - minimum segment length bound type: None: no lower bound enforced 'base_pair': output segments of at least min_segment_length [base pair] 'snp': output segments of at least min_segment_length consecutive SNPs out of the snps list. This is useful only if snps includes all SNPs (or is None) *NOTE*: min_segment_length''s units are interpreted differently depending on length_bound. margin = fraction of segment to discard near the endpoints (margin/2 is removed from each side).''' if debug: print 'Computing IBD segments between haplotypes (%d,%d), (%d,%d); filter %s length %d' % \ (id1, hap1_type, id2, hap2_type, error_filter, error_filter_length) d = diff.all_diffs(haplotype.data, id1, id2, hap1_type=hap1_type, hap2_type=hap2_type)[0] # Segment length, as defined by the input parameters segment_length = lambda f: np.inf if not length_bound else ( f.length if length_bound == 'base_pair' else f.num_snps) # @UnusedVariable # Consider informative or the specified SNPs only snps = snps if snps is not None else haplotype.snp_range snps = np.intersect1d(snps, np.where(d != INDETERMINATE)[0]) d_snps = d[snps] filtered_diff = filter_diff(d_snps, error_filter, error_filter_length) error_snps = snps[np.nonzero(d_snps - filtered_diff)[0]] # Detect edges as non-zero gradient points; output sufficiently long segments bp = haplotype.snp['base_pair'] num_snps = haplotype.num_snps if np.size(filtered_diff) == 0: # No data to consider ==> no IBD intervals can be identified segments = [] else: deriv = ndimage.convolve(filtered_diff, [1, -1]) edge = np.where(deriv != 0)[0] initial_phase = hap1_type if filtered_diff[0] == 0 else 1 - hap1_type if debug: print 'initial_phase', initial_phase # , 'edge', edge # Convert recombination locations to segments of no recombination; filter short segments segments = [ f for f in ( Segment(((x[0], x[1])), set(((id1, x[2]), (id2, hap2_type))), ( bp[x[0]], segment.stop_bp(bp, x[1], num_snps)), error_snps=segment.in_segment(error_snps, x)) for x in segment.edges_to_segments( snps, edge, initial_phase, haplotype.num_snps, hap1_type)) if segment_length(f) >= min_segment_length ] # Cut segment margins if margin >= constants.SMALL_FLOAT: segments = [ s for s in (s.middle_part(haplotype.nearest_snp, bp, margin) for s in segments) if s ] # Restrict errors to those inside segments segment_set = SegmentSet(segments, np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)), np.array([id1, id2])), dtype=int) \ if segments else gt.empty_errors_array()) if debug: print 'ibs_segments()', segment_set print 'errors', segment_set.errors return segment_set
def ibs_segments(haplotype, id1, id2, hap1_type, hap2_type, snps=None, include_alt_phase=False, error_filter='median', error_filter_length=5, length_bound=None, min_segment_length=INDETERMINATE, margin=0.0, debug=False): '''Return 1) Identical-by-State (IBS) segments separated by recombination events between two sample haplotypes (id1, hap1_type) and (id2, hap2_type). The 2-D output array's ith row format is (segment_start, segment_stop), (id1, hap1), (id2, hap2), (segment_start_bp, segment_stop_bp, segment_length_in_bp, num_errors_in_segment) The SNP range is [segment_start, segment_stop) where start=inclusive and stop is exclusive. 2) List of het_snp indices at which there are likely genotype errors. Options: snps - list of SNPs to base the comparison on. For parent-child comparisons, these should be heterozygous SNPs in the parent's genotype, distinguishing its haplotypes and used to locate segments. For unphased-phased individuals, these should be the list of homozygous SNPs at the unphased individual (those that have data). If not specified, all SNPs are used. length_bound - minimum segment length bound type: None: no lower bound enforced 'base_pair': output segments of at least min_segment_length [base pair] 'snp': output segments of at least min_segment_length consecutive SNPs out of the snps list. This is useful only if snps includes all SNPs (or is None) *NOTE*: min_segment_length''s units are interpreted differently depending on length_bound. margin = fraction of segment to discard near the endpoints (margin/2 is removed from each side).''' if debug: print 'Computing IBD segments between haplotypes (%d,%d), (%d,%d); filter %s length %d' % \ (id1, hap1_type, id2, hap2_type, error_filter, error_filter_length) d = diff.all_diffs(haplotype.data, id1, id2, hap1_type=hap1_type, hap2_type=hap2_type)[0] # Segment length, as defined by the input parameters segment_length = lambda f: np.inf if not length_bound else (f.length if length_bound == 'base_pair' else f.num_snps) # @UnusedVariable # Consider informative or the specified SNPs only snps = snps if snps is not None else haplotype.snp_range snps = np.intersect1d(snps, np.where(d != INDETERMINATE)[0]) d_snps = d[snps] filtered_diff = filter_diff(d_snps, error_filter, error_filter_length) error_snps = snps[np.nonzero(d_snps - filtered_diff)[0]] # Detect edges as non-zero gradient points; output sufficiently long segments bp = haplotype.snp['base_pair'] num_snps = haplotype.num_snps if np.size(filtered_diff) == 0: # No data to consider ==> no IBD intervals can be identified segments = [] else: deriv = ndimage.convolve(filtered_diff, [1, -1]) edge = np.where(deriv != 0)[0] initial_phase = hap1_type if filtered_diff[0] == 0 else 1 - hap1_type if debug: print 'initial_phase', initial_phase # , 'edge', edge # Convert recombination locations to segments of no recombination; filter short segments segments = [f for f in (Segment(((x[0], x[1])), set(((id1, x[2]), (id2, hap2_type))), (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)), error_snps=segment.in_segment(error_snps, x)) for x in segment.edges_to_segments(snps, edge, initial_phase, haplotype.num_snps, hap1_type)) if segment_length(f) >= min_segment_length] # Cut segment margins if margin >= constants.SMALL_FLOAT: segments = [s for s in (s.middle_part(haplotype.nearest_snp, bp, margin) for s in segments) if s] # Restrict errors to those inside segments segment_set = SegmentSet(segments, np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)), np.array([id1, id2])), dtype=int) \ if segments else gt.empty_errors_array()) if debug: print 'ibs_segments()', segment_set print 'errors', segment_set.errors return segment_set