def write_imputed(t, out, debug=False, poo_phase=None):
    """Write imputed genotypes to the stream ``out`` in CGI format.

    One line is written per SNP in ``t.genotype.snp_range``.  A line consists
    of the SNP's metadata fields (only when ``t.genotype.metadata`` is truthy;
    empty fields are replaced by ``'-'``), followed by one field per sample
    made of ``str(haplotype type)`` concatenated with the sample's two recoded
    allele codes.  Fields are tab-separated, the line ends with a newline and
    the stream is flushed after every line.

    Args:
        t: imputation result exposing ``imputed_data``, ``imputed_hap_type``
            and ``genotype`` (with ``metadata`` and ``snp_range``).
        out: writable stream.
        debug: if true, additionally print, per SNP, a table of sample index,
            haplotype type and raw imputed genotype.  NumPy's print threshold
            is lifted for the duration of the call and then restored.
        poo_phase: optional per-sample phase array.  Samples with a non-zero
            entry are marked ``im.constants.PHASED_WITH_ORIGIN`` in
            ``t.imputed_hap_type`` (modified in place); samples with a negative
            entry also get their two haplotypes swapped in the recoded data
            that is written out.
    """
    import sys  # local import: only needed to lift the print threshold

    data = im.recode.recode_cgi(t.imputed_data)
    metadata = t.genotype.metadata
    hap_type = t.imputed_hap_type

    if poo_phase is not None:
        aligned_samples = np.where(poo_phase)[0]
        t.imputed_hap_type[:, aligned_samples] = im.constants.PHASED_WITH_ORIGIN
        # Flip haplotypes of samples with flipped POO phase
        flipped_samples = np.where(poo_phase < 0)[0]
        orig = flattened_meshgrid(flipped_samples, im.constants.ALLELES)
        flipped = flattened_meshgrid(flipped_samples, list(reversed(im.constants.ALLELES)))
        # NOTE(review): relies on flattened_meshgrid() returning index arrays, so
        # that the right-hand side is copied before the assignment (a true swap).
        data[:, orig[0], orig[1]] = data[:, flipped[0], flipped[1]]

    saved_threshold = None
    if debug:
        # np.nan is not accepted as a threshold by current NumPy; sys.maxsize
        # is the documented way to request untruncated output.
        saved_threshold = np.get_printoptions()['threshold']
        np.set_printoptions(threshold=sys.maxsize)
    try:
        for snp in t.genotype.snp_range:
            if metadata:
                # Ensure that all fields are non-empty - easier to parse by
                # subsequent processes.  Each field is followed by a tab, so the
                # last one also separates the metadata from the genotypes.
                fields = [x if x else '-' for x in metadata[snp]]
                np.savetxt(out, np.array(fields), fmt='%s', newline='\011', delimiter='')
            # Joining with tabs yields the same text as writing every field
            # followed by a tab and then dropping the trailing tab.
            columns = [str(h) + (g[0] + g[1]) for h, g in zip(hap_type[snp], data[snp])]
            out.write('\t'.join(columns))
            out.write('\n')
            out.flush()
            if debug:
                print(np.concatenate((np.arange(data.shape[1])[np.newaxis].transpose(),
                                      hap_type[snp][np.newaxis].transpose(),
                                      t.imputed_data[snp]), axis=1))
    finally:
        if debug:
            # Put back whatever threshold the caller had, even on error
            np.set_printoptions(threshold=saved_threshold)
def __init__(self, problem, fraction=None, test_index=None):
    """Set up an experiment on ``problem`` with a chosen set of test genotypes.

    Exactly one of ``fraction`` and ``test_index`` must be given:

    * ``fraction`` - a test set is produced by ``clear_random_portion()`` on
      the working copy's genotype data.
    * ``test_index == 'hap'`` - nothing is cleared; every entry of
      ``problem.h`` is a test index and ``problem.g`` / ``problem.h`` of the
      *original* problem are used.  Useful for phasing result stats.
    * any other ``test_index`` - those entries are cleared with
      ``clear_index()`` on the working copy.

    The original genotypes of the test set are kept in ``self.g_orig`` (and
    recoded in ``self.r_orig``), the test indices in ``self.test_index``.

    Raises:
        ValueError: if both or neither of ``fraction``/``test_index`` is given.
    """
    fraction_given = fraction is not None
    index_given = test_index is not None
    # Exactly one of the two selectors must be supplied
    if fraction_given == index_given:
        raise ValueError('Must specify fraction or test_index')

    # Working problem: same pedigree object, genotype obtained via copy()
    self.problem = Problem(problem.pedigree, problem.genotype.copy())
    self.h = self.problem.h

    # Build the test set; the original genotypes end up in g_orig
    if not index_given:
        self.fraction = fraction
        self.g_orig, index = clear_random_portion(self.problem.genotype.data, fraction)
    elif test_index == 'hap':
        # Nothing is cleared; every (row, column) of the haplotype array is a test index
        hap = problem.h
        rows = range(hap.shape[0])
        cols = range(hap.shape[1])
        index = tuple(util.flattened_meshgrid(rows, cols))
        self.g_orig = problem.g
        self.h = hap
        self.fraction = 1.0
    else:
        self.g_orig, index = clear_index(self.problem.g, test_index)
        total_entries = self.h.shape[0] * self.h.shape[1]
        self.fraction = (1.0 * index[0].size) / total_entries

    self.num_tests = index[0].size
    self.test_index = index
    self.r_orig = recode.recode_single_genotype(self.g_orig)
    self.fill = self.problem.fill_fraction()[:, SAMPLE]
    # NOTE(review): presumably a lazily-filled cache - confirm against its users
    self.__recode_single_genotype = None
def __init__(self, problem, fraction=None, test_index=None):
    """Set up an experiment on ``problem`` with a chosen set of test genotypes.

    Exactly one of ``fraction`` and ``test_index`` must be given:

    * ``fraction`` - a test set is produced by ``clear_random_portion()`` on
      the working copy's genotype data.
    * ``test_index == 'hap'`` - nothing is cleared; every entry of
      ``problem.h`` is a test index and ``problem.g`` / ``problem.h`` of the
      *original* problem are used.  Useful for phasing result stats.
    * any other ``test_index`` - those entries are cleared with
      ``clear_index()`` on the working copy.

    The original genotypes of the test set are kept in ``self.g_orig`` (and
    recoded in ``self.r_orig``), the test indices in ``self.test_index``.

    Raises:
        ValueError: if both or neither of ``fraction``/``test_index`` is given.
    """
    fraction_given = fraction is not None
    index_given = test_index is not None
    # Exactly one of the two selectors must be supplied
    if fraction_given == index_given:
        raise ValueError('Must specify fraction or test_index')

    # Working problem: same pedigree object, genotype obtained via copy()
    self.problem = Problem(problem.pedigree, problem.genotype.copy())
    self.h = self.problem.h

    # Build the test set; the original genotypes end up in g_orig
    if not index_given:
        self.fraction = fraction
        self.g_orig, index = clear_random_portion(self.problem.genotype.data, fraction)
    elif test_index == 'hap':
        # Nothing is cleared; every (row, column) of the haplotype array is a test index
        hap = problem.h
        rows = range(hap.shape[0])
        cols = range(hap.shape[1])
        index = tuple(util.flattened_meshgrid(rows, cols))
        self.g_orig = problem.g
        self.h = hap
        self.fraction = 1.0
    else:
        self.g_orig, index = clear_index(self.problem.g, test_index)
        total_entries = self.h.shape[0] * self.h.shape[1]
        self.fraction = (1.0 * index[0].size) / total_entries

    self.num_tests = index[0].size
    self.test_index = index
    self.r_orig = recode.recode_single_genotype(self.g_orig)
    self.fill = self.problem.fill_fraction()[:, SAMPLE]
    # NOTE(review): presumably a lazily-filled cache - confirm against its users
    self.__recode_single_genotype = None
def genotype_ibs_segments(genotype, id1, id2, snps, error_filter='median', error_filter_length=5, margin=0.0, min_ibs_len_snp=400, debug=False):
    """Return the Identical-by-State (IBS >= 1) segments between two samples' genotypes.

    Args:
        genotype: genotype object exposing ``data``, ``num_snps``, ``snp`` and
            ``nearest_snp``.
        id1, id2: indices of the two samples to compare.
        snps: SNP indices to base the comparison on.  It is used both to index
            ``genotype.data`` and is itself indexed with an integer array, so it
            must be a NumPy index array (a ``(start, stop)`` tuple does not work).
        error_filter, error_filter_length: passed to ``filter_diff()``.
        margin: passed to ``Segment.middle_part()`` when it is at least
            ``constants.SMALL_FLOAT``; see ``ibs_segments()`` for its meaning.
        min_ibs_len_snp: minimum length passed to
            ``segment.segments_with_value()``.
        debug: if true, print the resulting segment set and its errors.

    Returns:
        A ``SegmentSet`` of the segments found, together with an errors array
        pairing every error SNP kept inside a segment with ``id1`` and ``id2``
        (or ``gt.empty_errors_array()`` when there are no segments).
    """
    num_snps = genotype.num_snps
    g = genotype.data
    g1 = recode.recode_single_genotype(g[snps, id1, :])
    g2 = recode.recode_single_genotype(g[snps, id2, :])
    # 1 where recode.ibs_state() reports state 0 for the pair, 0 elsewhere
    d = (recode.ibs_state(g1, g2) == 0).astype(np.byte)

    filtered_diff = filter_diff(d, error_filter, error_filter_length)
    # SNPs whose flag was changed by the filter are recorded as errors
    error_snps = snps[np.nonzero(d - filtered_diff)[0]]

    # No data to consider ==> no segments can be identified
    segments = []
    if np.size(filtered_diff) != 0:
        bp = genotype.snp['base_pair']
        # Runs of 0 in the filtered flags become segments; short runs are dropped
        segments = [Segment((x[0], x[1]), [id1, id2],
                            (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)),
                            error_snps=segment.in_segment(error_snps, x),
                            collapse_to_set=False)
                    for x in segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)]
        # Cut segment margins; falsy results of middle_part() are discarded
        if margin >= constants.SMALL_FLOAT:
            trimmed = (s.middle_part(genotype.nearest_snp, bp, margin, collapse_to_set=False)
                       for s in segments)
            segments = [s for s in trimmed if s]

    # Restrict errors to those inside segments
    if segments:
        # Flat comprehension instead of reduce(list.__add__, ...): linear time
        # and independent of the Python 2 builtin reduce().
        kept_errors = [snp for s in segments for snp in s.error_snps.tolist()]
        errors = np.array(util.flattened_meshgrid(kept_errors, np.array([id1, id2])), dtype=int)
    else:
        errors = gt.empty_errors_array()
    segment_set = SegmentSet(segments, errors)

    if debug:
        print('%s %s' % ('ibs_segments()', segment_set))
        print('%s %s' % ('errors', segment_set.errors))
    return segment_set
def write_imputed(t, out, debug=False, poo_phase=None):
    """Write imputed genotypes to the stream ``out`` in CGI format.

    One line is written per SNP in ``t.genotype.snp_range``.  A line consists
    of the SNP's metadata fields (only when ``t.genotype.metadata`` is truthy;
    empty fields are replaced by ``'-'``), followed by one field per sample
    made of ``str(haplotype type)`` concatenated with the sample's two recoded
    allele codes.  Fields are tab-separated, the line ends with a newline and
    the stream is flushed after every line.

    Args:
        t: imputation result exposing ``imputed_data``, ``imputed_hap_type``
            and ``genotype`` (with ``metadata`` and ``snp_range``).
        out: writable stream.
        debug: if true, additionally print, per SNP, a table of sample index,
            haplotype type and raw imputed genotype.  NumPy's print threshold
            is lifted for the duration of the call and then restored.
        poo_phase: optional per-sample phase array.  Samples with a non-zero
            entry are marked ``im.constants.PHASED_WITH_ORIGIN`` in
            ``t.imputed_hap_type`` (modified in place); samples with a negative
            entry also get their two haplotypes swapped in the recoded data
            that is written out.
    """
    import sys  # local import: only needed to lift the print threshold

    data = im.recode.recode_cgi(t.imputed_data)
    metadata = t.genotype.metadata
    hap_type = t.imputed_hap_type

    if poo_phase is not None:
        aligned_samples = np.where(poo_phase)[0]
        t.imputed_hap_type[:, aligned_samples] = im.constants.PHASED_WITH_ORIGIN
        # Flip haplotypes of samples with flipped POO phase
        flipped_samples = np.where(poo_phase < 0)[0]
        orig = flattened_meshgrid(flipped_samples, im.constants.ALLELES)
        flipped = flattened_meshgrid(flipped_samples, list(reversed(im.constants.ALLELES)))
        # NOTE(review): relies on flattened_meshgrid() returning index arrays, so
        # that the right-hand side is copied before the assignment (a true swap).
        data[:, orig[0], orig[1]] = data[:, flipped[0], flipped[1]]

    saved_threshold = None
    if debug:
        # np.nan is not accepted as a threshold by current NumPy; sys.maxsize
        # is the documented way to request untruncated output.
        saved_threshold = np.get_printoptions()['threshold']
        np.set_printoptions(threshold=sys.maxsize)
    try:
        for snp in t.genotype.snp_range:
            if metadata:
                # Ensure that all fields are non-empty - easier to parse by
                # subsequent processes.  Each field is followed by a tab, so the
                # last one also separates the metadata from the genotypes.
                fields = [x if x else '-' for x in metadata[snp]]
                np.savetxt(out, np.array(fields), fmt='%s', newline='\011', delimiter='')
            # Joining with tabs yields the same text as writing every field
            # followed by a tab and then dropping the trailing tab.
            columns = [str(h) + (g[0] + g[1]) for h, g in zip(hap_type[snp], data[snp])]
            out.write('\t'.join(columns))
            out.write('\n')
            out.flush()
            if debug:
                print(np.concatenate((np.arange(data.shape[1])[np.newaxis].transpose(),
                                      hap_type[snp][np.newaxis].transpose(),
                                      t.imputed_data[snp]), axis=1))
    finally:
        if debug:
            # Put back whatever threshold the caller had, even on error
            np.set_printoptions(threshold=saved_threshold)
def __error_index(self, diff, errors, num_errors, error_type):
    """Return ``(snp indices, sample indices)`` of genotype errors of one kind.

    ``errors`` holds row indices into ``diff``/``self.snps`` and ``num_errors``
    the matching per-row error counts.  Depending on ``error_type``:

    * ``FamilyIbdComputer.NON_TEMPLATE`` - rows with exactly one error; the
      samples are the children whose ``diff`` entry in those rows is non-zero.
    * ``FamilyIbdComputer.TEMPLATE`` - rows with ``num_children - 1`` errors;
      the sample is the template child, repeated once per row.
    * anything else - all remaining rows; the parent and every child are
      flagged at each of those SNPs (the best that can be done for now).
    """
    if error_type == FamilyIbdComputer.NON_TEMPLATE:
        # Non-template children errors
        rows = np.where(num_errors == 1)[0]
        snp_index = errors[rows]
        child_columns = np.where(diff[snp_index, :] != 0)[1]
        return (self.snps[snp_index], self.children[child_columns])

    if error_type == FamilyIbdComputer.TEMPLATE:
        # Template child errors
        rows = np.where(num_errors == self.num_children - 1)[0]
        snp_index = errors[rows]
        template_child = self.children[self.template]
        return (self.snps[snp_index], np.tile(template_child, (len(snp_index),)))

    # Indecisive cases: flag parent + all children as errors
    undecided = np.logical_and(num_errors != 1, num_errors != self.num_children - 1)
    snp_index = errors[np.where(undecided)[0]]
    family = np.concatenate((self.children, [self.parent]))
    pairs = util.flattened_meshgrid(family, self.snps[snp_index])
    # The meshgrid is built (sample, snp); swap to return (snp, sample)
    return (pairs[1], pairs[0])
def genotype_ibs_segments(genotype, id1, id2, snps, error_filter='median', error_filter_length=5, margin=0.0, min_ibs_len_snp=400, debug=False):
    """Return the Identical-by-State (IBS >= 1) segments between two samples' genotypes.

    Args:
        genotype: genotype object exposing ``data``, ``num_snps``, ``snp`` and
            ``nearest_snp``.
        id1, id2: indices of the two samples to compare.
        snps: SNP indices to base the comparison on.  It is used both to index
            ``genotype.data`` and is itself indexed with an integer array, so it
            must be a NumPy index array (a ``(start, stop)`` tuple does not work).
        error_filter, error_filter_length: passed to ``filter_diff()``.
        margin: passed to ``Segment.middle_part()`` when it is at least
            ``constants.SMALL_FLOAT``; see ``ibs_segments()`` for its meaning.
        min_ibs_len_snp: minimum length passed to
            ``segment.segments_with_value()``.
        debug: if true, print the resulting segment set and its errors.

    Returns:
        A ``SegmentSet`` of the segments found, together with an errors array
        pairing every error SNP kept inside a segment with ``id1`` and ``id2``
        (or ``gt.empty_errors_array()`` when there are no segments).
    """
    num_snps = genotype.num_snps
    g = genotype.data
    g1 = recode.recode_single_genotype(g[snps, id1, :])
    g2 = recode.recode_single_genotype(g[snps, id2, :])
    # 1 where recode.ibs_state() reports state 0 for the pair, 0 elsewhere
    d = (recode.ibs_state(g1, g2) == 0).astype(np.byte)

    filtered_diff = filter_diff(d, error_filter, error_filter_length)
    # SNPs whose flag was changed by the filter are recorded as errors
    error_snps = snps[np.nonzero(d - filtered_diff)[0]]

    # No data to consider ==> no segments can be identified
    segments = []
    if np.size(filtered_diff) != 0:
        bp = genotype.snp['base_pair']
        # Runs of 0 in the filtered flags become segments; short runs are dropped
        segments = [Segment((x[0], x[1]), [id1, id2],
                            (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)),
                            error_snps=segment.in_segment(error_snps, x),
                            collapse_to_set=False)
                    for x in segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)]
        # Cut segment margins; falsy results of middle_part() are discarded
        if margin >= constants.SMALL_FLOAT:
            trimmed = (s.middle_part(genotype.nearest_snp, bp, margin, collapse_to_set=False)
                       for s in segments)
            segments = [s for s in trimmed if s]

    # Restrict errors to those inside segments
    if segments:
        # Flat comprehension instead of reduce(list.__add__, ...): linear time
        # and independent of the Python 2 builtin reduce().
        kept_errors = [snp for s in segments for snp in s.error_snps.tolist()]
        errors = np.array(util.flattened_meshgrid(kept_errors, np.array([id1, id2])), dtype=int)
    else:
        errors = gt.empty_errors_array()
    segment_set = SegmentSet(segments, errors)

    if debug:
        print('%s %s' % ('ibs_segments()', segment_set))
        print('%s %s' % ('errors', segment_set.errors))
    return segment_set
def impute(self, samples=None):
    """Run the two bootstrap imputation stages and return ``self.result``.

    Stage 1 phases the homozygous training samples and then bootstraps from
    their haplotypes to the non-homozygous training samples.  Stage 2
    bootstraps from the resulting haplotype set to every sample of ``self.h``
    that is not a training sample, optionally restricted to ``samples``.

    Args:
        samples: optional iterable of sample indices; when given, only target
            samples that are also in this collection are imputed.
    """
    # Phase all hom training samples
    self.__phase_hom()

    # Phase as many non-hom training samples as possible.
    # Bootstrap: impute source -> targets; pass imputed targets to source; repeat
    source_haps = azip(*util.flattened_meshgrid(self.hom, ALLELES))
    typed_targets = set(self.non_hom)
    source_haps, _ = self._impute_bootstrap(
        source_haps, typed_targets, self.num_passes_training, True, "non-hom typed")

    # Impute target samples from all available training haps
    untyped_targets = set(range(self.h.num_samples)).difference(self.training_sample_index)
    if samples is not None:
        untyped_targets.intersection_update(samples)
    source_haps, untyped_targets = self._impute_bootstrap(
        source_haps, untyped_targets, self.num_passes_training, False, "non-typed")
    return self.result
def impute(self):
    """Impute all samples of ``self.h`` from the training samples; return ``self.result``.

    Steps:
        1. Phase the homozygous training samples.
        2. Impute every non-homozygous training sample from the haplotypes of
           the homozygous ones.
        3. Impute every non-training sample from all haplotypes that are
           non-zero in ``self.result`` at that point.

    A progress line is always printed after step 1; additional diagnostics
    (source haplotypes ``R`` and target samples ``T`` of each stage) are
    printed when ``self.debug`` is true.
    """
    # Phase all hom training samples
    self.__phase_hom()
    print('Imputing genotyped hom |hom| %d |non_hom| %d' % (len(self.hom), len(self.non_hom)))

    # Phase as much as possible non-hom training samples
    R = util.flattened_meshgrid(self.hom, ALLELES)
    if self.debug:
        print('#' * 80)
        print('Imputing genotyped non-hom |R| %d |T| %d' % (len(R[0]), len(self.non_hom)))
        print('#' * 80)
        print('R %s' % (R,))
        print('T %s' % (self.non_hom,))
    for sample in self.non_hom:
        self._impute_sample(sample, R, True)

    # Impute non-training samples from all available training haps
    R = self.result.nonzero()
    T = set(range(self.h.num_samples)) - set(self.training_sample_index)
    if self.debug:
        print('#' * 80)
        print('Imputing non-genotyped |R| %d |T| %d' % (len(R[0]), len(T)))
        print('#' * 80)
        print('R %s' % (R,))
        # Report the actual target set of this stage (previously self.non_hom
        # was printed here under the label 'T').
        print('T %s' % (T,))
    for sample in T:
        self._impute_sample(sample, R, False)
    return self.result
def impute(self):
    """Impute all samples of ``self.h`` from the training samples; return ``self.result``.

    Steps:
        1. Phase the homozygous training samples.
        2. Impute every non-homozygous training sample from the haplotypes of
           the homozygous ones.
        3. Impute every non-training sample from all haplotypes that are
           non-zero in ``self.result`` at that point.

    A progress line is always printed after step 1; additional diagnostics
    (source haplotypes ``R`` and target samples ``T`` of each stage) are
    printed when ``self.debug`` is true.
    """
    # Phase all hom training samples
    self.__phase_hom()
    print('Imputing genotyped hom |hom| %d |non_hom| %d' % (len(self.hom), len(self.non_hom)))

    # Phase as much as possible non-hom training samples
    R = util.flattened_meshgrid(self.hom, ALLELES)
    if self.debug:
        print('#' * 80)
        print('Imputing genotyped non-hom |R| %d |T| %d' % (len(R[0]), len(self.non_hom)))
        print('#' * 80)
        print('R %s' % (R,))
        print('T %s' % (self.non_hom,))
    for sample in self.non_hom:
        self._impute_sample(sample, R, True)

    # Impute non-training samples from all available training haps
    R = self.result.nonzero()
    T = set(range(self.h.num_samples)) - set(self.training_sample_index)
    if self.debug:
        print('#' * 80)
        print('Imputing non-genotyped |R| %d |T| %d' % (len(R[0]), len(T)))
        print('#' * 80)
        print('R %s' % (R,))
        # Report the actual target set of this stage (previously self.non_hom
        # was printed here under the label 'T').
        print('T %s' % (T,))
    for sample in T:
        self._impute_sample(sample, R, False)
    return self.result
def impute(self, samples=None):
    """Run the two bootstrap imputation stages and return ``self.result``.

    Stage 1 phases the homozygous training samples and then bootstraps from
    their haplotypes to the non-homozygous training samples.  Stage 2
    bootstraps from the resulting haplotype set to every sample of ``self.h``
    that is not a training sample, optionally restricted to ``samples``.

    Args:
        samples: optional iterable of sample indices; when given, only target
            samples that are also in this collection are imputed.
    """
    # Phase all hom training samples
    self.__phase_hom()

    # Phase as many non-hom training samples as possible.
    # Bootstrap: impute source -> targets; pass imputed targets to source; repeat
    source_haps = azip(*util.flattened_meshgrid(self.hom, ALLELES))
    typed_targets = set(self.non_hom)
    source_haps, _ = self._impute_bootstrap(
        source_haps, typed_targets, self.num_passes_training, True, 'non-hom typed')

    # Impute target samples from all available training haps
    untyped_targets = set(range(self.h.num_samples)).difference(self.training_sample_index)
    if samples is not None:
        untyped_targets.intersection_update(samples)
    source_haps, untyped_targets = self._impute_bootstrap(
        source_haps, untyped_targets, self.num_passes_training, False, 'non-typed')
    return self.result
def __error_index(self, diff, errors, num_errors, error_type):
    """Return ``(snp indices, sample indices)`` of genotype errors of one kind.

    ``errors`` holds row indices into ``diff``/``self.snps`` and ``num_errors``
    the matching per-row error counts.  Depending on ``error_type``:

    * ``FamilyIbdComputer.NON_TEMPLATE`` - rows with exactly one error; the
      samples are the children whose ``diff`` entry in those rows is non-zero.
    * ``FamilyIbdComputer.TEMPLATE`` - rows with ``num_children - 1`` errors;
      the sample is the template child, repeated once per row.
    * anything else - all remaining rows; the parent and every child are
      flagged at each of those SNPs (the best that can be done for now).
    """
    if error_type == FamilyIbdComputer.NON_TEMPLATE:
        # Non-template children errors
        rows = np.where(num_errors == 1)[0]
        snp_index = errors[rows]
        child_columns = np.where(diff[snp_index, :] != 0)[1]
        return (self.snps[snp_index], self.children[child_columns])

    if error_type == FamilyIbdComputer.TEMPLATE:
        # Template child errors
        rows = np.where(num_errors == self.num_children - 1)[0]
        snp_index = errors[rows]
        template_child = self.children[self.template]
        return (self.snps[snp_index], np.tile(template_child, (len(snp_index),)))

    # Indecisive cases: flag parent + all children as errors
    undecided = np.logical_and(num_errors != 1, num_errors != self.num_children - 1)
    snp_index = errors[np.where(undecided)[0]]
    family = np.concatenate((self.children, [self.parent]))
    pairs = util.flattened_meshgrid(family, self.snps[snp_index])
    # The meshgrid is built (sample, snp); swap to return (snp, sample)
    return (pairs[1], pairs[0])
def ibs_segments(haplotype, id1, id2, hap1_type, hap2_type, snps=None, include_alt_phase=False, error_filter='median', error_filter_length=5, length_bound=None, min_segment_length=INDETERMINATE, margin=0.0, debug=False):
    """Return the Identical-by-State (IBS) segments between two sample haplotypes.

    The haplotypes compared are ``(id1, hap1_type)`` and ``(id2, hap2_type)``;
    segments are separated by recombination events (edges in the filtered
    difference between the two haplotypes).  Each segment covers the SNP range
    ``[start, stop)`` (start inclusive, stop exclusive) and carries the SNPs
    inside it at which there are likely genotype errors.

    Args:
        haplotype: haplotype object exposing ``data``, ``snp``, ``snp_range``,
            ``num_snps`` and ``nearest_snp``.
        id1, id2: sample indices.
        hap1_type, hap2_type: haplotype (allele) index of each sample.
        snps: SNPs to base the comparison on.  For parent-child comparisons
            these should be heterozygous SNPs in the parent's genotype,
            distinguishing its haplotypes and used to locate segments.  For
            unphased-phased individuals these should be the homozygous SNPs of
            the unphased individual (those that have data).  If ``None``, all
            SNPs (``haplotype.snp_range``) are used.  In every case only SNPs
            whose difference is not ``INDETERMINATE`` are considered.
        include_alt_phase: accepted for interface compatibility; not used by
            this function.
        error_filter, error_filter_length: passed to ``filter_diff()``.
        length_bound: type of the minimum segment length bound:
            ``None`` - no lower bound is enforced;
            ``'base_pair'`` - keep segments of at least ``min_segment_length``
            base pairs;
            ``'snp'`` (any other truthy value) - keep segments of at least
            ``min_segment_length`` consecutive SNPs out of the ``snps`` list,
            which is useful only if ``snps`` includes all SNPs (or is ``None``).
        min_segment_length: the bound itself.  NOTE: its units are interpreted
            differently depending on ``length_bound``.
        margin: fraction of each segment to discard near the endpoints
            (margin/2 is removed from each side); applied only when it is at
            least ``constants.SMALL_FLOAT``.
        debug: if true, print diagnostics.

    Returns:
        A ``SegmentSet`` of the segments found, together with an errors array
        pairing every error SNP kept inside a segment with ``id1`` and ``id2``
        (or ``gt.empty_errors_array()`` when there are no segments).
    """
    if debug:
        print('Computing IBD segments between haplotypes (%d,%d), (%d,%d); filter %s length %d' %
              (id1, hap1_type, id2, hap2_type, error_filter, error_filter_length))
    d = diff.all_diffs(haplotype.data, id1, id2, hap1_type=hap1_type, hap2_type=hap2_type)[0]

    def segment_length(f):
        """Length of segment f in the units selected by length_bound (inf if unbounded)."""
        if not length_bound:
            return np.inf
        return f.length if length_bound == 'base_pair' else f.num_snps

    # Consider informative or the specified SNPs only
    snps = snps if snps is not None else haplotype.snp_range
    snps = np.intersect1d(snps, np.where(d != INDETERMINATE)[0])
    d_snps = d[snps]
    filtered_diff = filter_diff(d_snps, error_filter, error_filter_length)
    # SNPs whose difference was changed by the filter are recorded as errors
    error_snps = snps[np.nonzero(d_snps - filtered_diff)[0]]

    # Detect edges as non-zero gradient points; output sufficiently long segments
    bp = haplotype.snp['base_pair']
    num_snps = haplotype.num_snps
    if np.size(filtered_diff) == 0:
        # No data to consider ==> no IBD intervals can be identified
        segments = []
    else:
        deriv = ndimage.convolve(filtered_diff, [1, -1])
        edge = np.where(deriv != 0)[0]
        initial_phase = hap1_type if filtered_diff[0] == 0 else 1 - hap1_type
        if debug:
            print('%s %s' % ('initial_phase', initial_phase))
        # Convert recombination locations to segments of no recombination
        candidates = (Segment((x[0], x[1]),
                              set(((id1, x[2]), (id2, hap2_type))),
                              (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)),
                              error_snps=segment.in_segment(error_snps, x))
                      for x in segment.edges_to_segments(snps, edge, initial_phase,
                                                         haplotype.num_snps, hap1_type))
        # Filter short segments
        segments = [f for f in candidates if segment_length(f) >= min_segment_length]

    # Cut segment margins; falsy results of middle_part() are discarded
    if margin >= constants.SMALL_FLOAT:
        trimmed = (s.middle_part(haplotype.nearest_snp, bp, margin) for s in segments)
        segments = [s for s in trimmed if s]

    # Restrict errors to those inside segments
    if segments:
        # Flat comprehension instead of reduce(list.__add__, ...): linear time
        # and independent of the Python 2 builtin reduce().
        kept_errors = [snp for s in segments for snp in s.error_snps.tolist()]
        errors = np.array(util.flattened_meshgrid(kept_errors, np.array([id1, id2])), dtype=int)
    else:
        errors = gt.empty_errors_array()
    segment_set = SegmentSet(segments, errors)

    if debug:
        print('%s %s' % ('ibs_segments()', segment_set))
        print('%s %s' % ('errors', segment_set.errors))
    return segment_set
def ibs_segments(haplotype, id1, id2, hap1_type, hap2_type, snps=None, include_alt_phase=False, error_filter='median', error_filter_length=5, length_bound=None, min_segment_length=INDETERMINATE, margin=0.0, debug=False):
    """Return the Identical-by-State (IBS) segments between two sample haplotypes.

    The haplotypes compared are ``(id1, hap1_type)`` and ``(id2, hap2_type)``;
    segments are separated by recombination events (edges in the filtered
    difference between the two haplotypes).  Each segment covers the SNP range
    ``[start, stop)`` (start inclusive, stop exclusive) and carries the SNPs
    inside it at which there are likely genotype errors.

    Args:
        haplotype: haplotype object exposing ``data``, ``snp``, ``snp_range``,
            ``num_snps`` and ``nearest_snp``.
        id1, id2: sample indices.
        hap1_type, hap2_type: haplotype (allele) index of each sample.
        snps: SNPs to base the comparison on.  For parent-child comparisons
            these should be heterozygous SNPs in the parent's genotype,
            distinguishing its haplotypes and used to locate segments.  For
            unphased-phased individuals these should be the homozygous SNPs of
            the unphased individual (those that have data).  If ``None``, all
            SNPs (``haplotype.snp_range``) are used.  In every case only SNPs
            whose difference is not ``INDETERMINATE`` are considered.
        include_alt_phase: accepted for interface compatibility; not used by
            this function.
        error_filter, error_filter_length: passed to ``filter_diff()``.
        length_bound: type of the minimum segment length bound:
            ``None`` - no lower bound is enforced;
            ``'base_pair'`` - keep segments of at least ``min_segment_length``
            base pairs;
            ``'snp'`` (any other truthy value) - keep segments of at least
            ``min_segment_length`` consecutive SNPs out of the ``snps`` list,
            which is useful only if ``snps`` includes all SNPs (or is ``None``).
        min_segment_length: the bound itself.  NOTE: its units are interpreted
            differently depending on ``length_bound``.
        margin: fraction of each segment to discard near the endpoints
            (margin/2 is removed from each side); applied only when it is at
            least ``constants.SMALL_FLOAT``.
        debug: if true, print diagnostics.

    Returns:
        A ``SegmentSet`` of the segments found, together with an errors array
        pairing every error SNP kept inside a segment with ``id1`` and ``id2``
        (or ``gt.empty_errors_array()`` when there are no segments).
    """
    if debug:
        print('Computing IBD segments between haplotypes (%d,%d), (%d,%d); filter %s length %d' %
              (id1, hap1_type, id2, hap2_type, error_filter, error_filter_length))
    d = diff.all_diffs(haplotype.data, id1, id2, hap1_type=hap1_type, hap2_type=hap2_type)[0]

    def segment_length(f):
        """Length of segment f in the units selected by length_bound (inf if unbounded)."""
        if not length_bound:
            return np.inf
        return f.length if length_bound == 'base_pair' else f.num_snps

    # Consider informative or the specified SNPs only
    snps = snps if snps is not None else haplotype.snp_range
    snps = np.intersect1d(snps, np.where(d != INDETERMINATE)[0])
    d_snps = d[snps]
    filtered_diff = filter_diff(d_snps, error_filter, error_filter_length)
    # SNPs whose difference was changed by the filter are recorded as errors
    error_snps = snps[np.nonzero(d_snps - filtered_diff)[0]]

    # Detect edges as non-zero gradient points; output sufficiently long segments
    bp = haplotype.snp['base_pair']
    num_snps = haplotype.num_snps
    if np.size(filtered_diff) == 0:
        # No data to consider ==> no IBD intervals can be identified
        segments = []
    else:
        deriv = ndimage.convolve(filtered_diff, [1, -1])
        edge = np.where(deriv != 0)[0]
        initial_phase = hap1_type if filtered_diff[0] == 0 else 1 - hap1_type
        if debug:
            print('%s %s' % ('initial_phase', initial_phase))
        # Convert recombination locations to segments of no recombination
        candidates = (Segment((x[0], x[1]),
                              set(((id1, x[2]), (id2, hap2_type))),
                              (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)),
                              error_snps=segment.in_segment(error_snps, x))
                      for x in segment.edges_to_segments(snps, edge, initial_phase,
                                                         haplotype.num_snps, hap1_type))
        # Filter short segments
        segments = [f for f in candidates if segment_length(f) >= min_segment_length]

    # Cut segment margins; falsy results of middle_part() are discarded
    if margin >= constants.SMALL_FLOAT:
        trimmed = (s.middle_part(haplotype.nearest_snp, bp, margin) for s in segments)
        segments = [s for s in trimmed if s]

    # Restrict errors to those inside segments
    if segments:
        # Flat comprehension instead of reduce(list.__add__, ...): linear time
        # and independent of the Python 2 builtin reduce().
        kept_errors = [snp for s in segments for snp in s.error_snps.tolist()]
        errors = np.array(util.flattened_meshgrid(kept_errors, np.array([id1, id2])), dtype=int)
    else:
        errors = gt.empty_errors_array()
    segment_set = SegmentSet(segments, errors)

    if debug:
        print('%s %s' % ('ibs_segments()', segment_set))
        print('%s %s' % ('errors', segment_set.errors))
    return segment_set