def run(self, problem, params=None):
    '''Run the phasing processing chain on a problem.

    Adapts the generic Filter interface so that a caller can supply both a Problem and a
    PhaseParam: the two are bundled into a single request struct, which is handed to
    handle(); the result of handle() is returned.

    problem -- the problem object to process.
    params -- phasing parameters. If None (or otherwise falsy), a default-constructed
    PhaseParam is used instead.'''
    request = util.Struct(problem=problem, params=params or PhaseParam())
    return self.handle(request)
def test_phase_family(self):
    '''Run the phaser on the Hutterites genotype sample and verify the data set size, the
    number of genotyped trios, and the post-phasing problem statistics.'''
    problem = io.read_plink(pedigree=itu.HUTT_PED,
                            prefix=itu.GENOTYPE_SAMPLE,
                            haplotype=None)

    # Sanity checks on the loaded data before phasing
    itu.assert_size_equals(problem.genotype, 8, 1415)
    num_trios = len(problem.trios())
    assert_equal(num_trios, 869, 'Unexpected # of genotyped trios')

    # Phase quietly, then compare against the expected statistics
    quiet_params = PhaseParam(debug=False)
    self.phaser.run(problem, quiet_params)
    itu.assert_problem_stats(problem, 22640, 20225, 144)
def __init__(self, params=None):
    '''Initialize this object from a set of phasing parameters.

    params -- PhaseParam object whose debug flag is copied onto this instance. When omitted
    (None), a new PhaseParam() is created for this instance. The previous signature used
    params=PhaseParam(), which is evaluated once at function-definition time, so every
    instance built without an explicit argument shared (and could mutate) the same
    parameter object.'''
    if params is None:
        params = PhaseParam()
    self.params = params
    self.debug = params.debug
    # TODO: move into params
    # TODO: calculate min_segment_length from s_ibd, s, slice_size
    self.min_segment_length = 0.0
    self.max_difference = 2
def ibd_germline(problem, samples):
    '''Compute IBD segments among the haplotypes of the genotyped samples using GERMLINE.

    A GermlineIbdComputer with default PhaseParam settings is run on a haplotype matrix built
    from the genotyped subset of samples. The resulting segment set is grouped into disjoint
    segments in place and returned. Segments are rounded to the nearest slice.'''
    computer = GermlineIbdComputer(PhaseParam())
    genotyped_samples = gt.genotyped(problem, samples)
    hap_matrix = _HapMatrix(problem, genotyped_samples)
    segments = computer.ibd_segments(hap_matrix)
    segments.group_to_disjoint()
    return segments
def setUp(self):
    '''Load a single nuclear family test case with node 2 removed.'''
    loaded = im.io.read_npz(itu.FAMILY945_ONE_PARENT_STAGE2)
    # Drop a key child (node 2) to make the problem more interesting for the IBD algorithm
    self.problem = loaded.remove_nodes([2])

    families = self.problem.families(genotyped=False)
    self.family = families[0]
    self.sibs = ig._filled_members(self.problem, self.family)

    default_params = PhaseParam()
    self.ibd_computer = ig.GermlineIbdComputer(default_params)
def test_family_12(self):
    '''Test comparing sibs with non-genotyped parents (stage 4) on family 12. The problem
    statistics are checked both before and after running the sib comparison phaser, against
    the same expected values.'''
    expected_stats = (45052, 42162, 237)

    problem = io.read_npz(itu.FAMILY12_STAGE2)
    itu.assert_size_equals(problem.genotype, 3218, 7)
    itu.assert_problem_stats(problem, *expected_stats)

    families = list(problem.families(genotyped=False))
    assert_equal(len(families), 1, 'Incorrect number of families')

    family_sib_comparison_phaser().run(problem, PhaseParam(single_member=1))
    itu.assert_problem_stats(problem, *expected_stats)
def __main(options): ''' -------------------------------------------------- Main program - accepts an options struct. -------------------------------------------------- ''' if options.debug: print 'Input options', options print 'Building phaser (stage = %d) ...' % (options.stage, ) phaser = build_phasing_pipeline(options) if options.debug: print 'Reading data ...' problem = __load_problem(options) if options.debug: print 'Phasing ...' params = PhaseParam() params.update_from_struct(options) request = run_phasing_chain(phaser, problem, params) print '' request.stats.pprint() print '' if options.output is not None: if options.min_output: print 'Minimizing output size...' io.slim(problem) out_prefix, ext = os.path.splitext(options.output) if ext == '.npz': print 'Writing haplotype result to %s in NPZ format ...' % ( options.output, ) io.write_npz(problem, options.output) output_info = out_prefix + '.info.npz' print 'Writing problem info result to %s in NPZ format ...' % ( output_info, ) io.write_info_npz(problem.info, output_info) else: print 'Writing haplotype result to %s in PLINK format ...' % ( options.output, ) io.write_plink(problem, options.output, verbose=options.debug) return problem
def test_ibd_segments_sib_pair2(self):
    '''Compute IBD segments between samples 3 and 5 with ibd_segments_with_relatives (HMM
    IBD probability, max_path_length=2) and compare them with the expected segments
    based on nuclear family info.'''
    params = PhaseParam(id_coef_file=im.itu.ID_COEF_FILE, max_path_length=2)
    actual = ibd_segments_with_relatives(self.problem, 3, 5, params,
                                         im.ibd_hmm.prob_ibd_hmm)
    expected = [
        ((1278, 1337), (31217345, 32331594, 1.114, 1), ((3, 0), (5, 0))),
        ((2276, 2363), (42344297, 43527681, 1.183, 1), ((3, 1), (5, 0))),
        ((3138, 3206), (49803008, 50837224, 1.034, 1), ((3, 1), (5, 0))),
        ((603, 3218), (25453554, 51156934, 25.703, 1), ((5, 1), (3, 1))),
    ]
    im.itu.assert_segments_almost_equal(actual, expected, full_data=True, decimal=3,
                                        err_msg='Wrong IBD segments')
def test_ibd_segments_sib_pair(self):
    '''Compute IBD segments between samples 3 and 2 with ibd_segments_with_relatives (HMM
    IBD probability) and compare them with the expected segments based on nuclear family
    info.'''
    params = PhaseParam(id_coef_file=im.itu.ID_COEF_FILE)
    actual = ibd_segments_with_relatives(self.problem, 3, 2, params,
                                         im.ibd_hmm.prob_ibd_hmm)
    expected = [
        ((1412, 3218), (32992389, 51156934, 18.165, 1), ((3, 0), (2, 0))),
        ((241, 451), (19643555, 23817486, 4.174, 1), ((2, 0), (3, 1))),
        ((0, 600), (16484792, 25444874, 8.960, 1), ((3, 1), (2, 1))),
        ((2650, 3218), (45892433, 51156934, 5.265, 1), ((3, 1), (2, 1))),
    ]
    im.itu.assert_segments_almost_equal(actual, expected, full_data=True, decimal=3,
                                        err_msg='Wrong IBD segments')
def test_ibd_segments_ibdld(self):
    '''Calculate IBD segments between samples 2 and 8 of a nuclear family using IBDLD, then
    check the grouped segments and that no errors are reported.'''
    ibd_ld = im.ibdld.ibd_ld
    cache = ibd_ld.IbdSegmentGlobalCacheIbdld(itu.FAMILY7 + '.ibd')
    computer = ibd_ld.IbdSegmentComputerIbdld(
        cache, self.haplotype,
        chrom=22,
        sample_id=self.problem.pedigree.sample_id,
        samples=[2, 8],
        threshold=0.9,
        params=PhaseParam())
    grouped = segment.break_and_group_segments(computer.segments)

    expected = [((38, 2849), [], ((8, 1), (2, 1)))]
    assert_segments_almost_equal(grouped, expected, full_data=False, decimal=3,
                                 err_msg='Wrong grouped IBDLD IBD segments')
    assert_equal(grouped.errors, empty_errors_array(),
                 'IBDLD does not support errors but they are output?!')
def run_phasing_chain(phaser, problem, params=None):
    '''Run phasing, stats saving and post-processing as one long pipeline.

    Builds a request struct holding the problem, the parameters (a default PhaseParam when
    params is None or otherwise falsy), a copy of the original genotype data and an empty
    stats struct; passes it to phaser.handle(); records the elapsed wall-clock time in
    request.stats.time. Returns the populated request object.'''
    request = util.Struct(problem=problem,
                          params=params or PhaseParam(),
                          g_orig=problem.genotype.data.copy(),
                          stats=util.Struct())

    # Time the phasing processing chain
    started = time.time()
    phaser.handle(request)
    request.stats.time = time.time() - started
    return request
def __main(options): ''' -------------------------------------------------- Main program - accepts an options struct. -------------------------------------------------- ''' if options.debug: print 'Input options', options print 'Building phaser (stage = %d) ...' % (options.stage,) phaser = build_phasing_pipeline(options) if options.debug: print 'Reading data ...' problem = __load_problem(options) if options.debug: print 'Phasing ...' params = PhaseParam() params.update_from_struct(options) request = run_phasing_chain(phaser, problem, params) print '' request.stats.pprint() print '' if options.output is not None: if options.min_output: print 'Minimizing output size...' io.slim(problem) out_prefix, ext = os.path.splitext(options.output) if ext == '.npz': print 'Writing haplotype result to %s in NPZ format ...' % (options.output,) io.write_npz(problem, options.output) output_info = out_prefix + '.info.npz' print 'Writing problem info result to %s in NPZ format ...' % (output_info,) io.write_info_npz(problem.info, output_info) else: print 'Writing haplotype result to %s in PLINK format ...' % (options.output,) io.write_plink(problem, options.output, verbose=options.debug) return problem
def setUp(self):
    '''Load the family 13 test problem and build the phaser and child comparator used by
    the tests.'''
    unittest.TestCase.setUp(self)

    # Test data is the output of previous phasing stages
    self.problem = io.read_npz(itu.FAMILY13_STAGE2)
    self.family = self.problem.families()[0]

    # Phasing pipeline: trivial -> family -> family child comparison
    stages = [trivial_phaser(), family_phaser(), family_child_comparison_phaser()]
    self.phaser = phase_core.PhaseDecorator(FilterChain(stages))

    request = Struct(problem=self.problem, params=PhaseParam())
    self.comparator = ic.ChildComparator(request, self.family)
def test_child_comparison_phaser(self):
    '''Test the main phasing method: phase the parents of the family by running the family
    child comparison phaser, and check each parent's haplotype fill fraction before and
    after.'''
    haplotype = self.problem.haplotype
    father, mother = self.family.father, self.family.mother

    # Fill fractions before phasing (father, then mother)
    for parent, expected_fill in ((father, 0.60), (mother, 0.63)):
        assert_almost_equal(haplotype.fill_fraction(sample=parent), expected_fill, 2,
                            'Unexpected pre-phasing parent fill %')

    family_child_comparison_phaser().run(self.problem, PhaseParam())

    # Both parents should be almost fully phased afterwards
    for parent in (father, mother):
        assert_almost_equal(haplotype.fill_fraction(sample=parent), 0.998, 3,
                            'Unexpected post-phasing parent fill %')
def test_ibd_parent_vs_all_children(self):
    '''Test calculating distant IBD segments between sample 0 and all genotyped children of
    the first family; compare with the expected IBD segments based on nuclear family info.

    Each expected entry is passed to assert_segments_almost_equal with full_data=True and
    compared to 3 decimal places.'''
    # Earlier variant of this test, kept for reference:
    # segment_set = ibd_segments_with_surrogate_parents(self.problem, 0,
    #     PhaseParam(margin=0., surrogate_parent_fill_threshold=0.9,
    #     max_path_length=2, debug=True),
    #     prob_ibd_calculator=im.ibd_hmm.prob_ibd_hmm,
    #     is_i_phased=True)

    # Turn off kinship-based POO determination IBD segment computation since we don't have
    # a complete pedigree here
    segment_set = ibd_segments_with_relatives(self.problem, 0,
                                              genotyped_children(self.problem, self.problem.first_family),
                                              PhaseParam(id_coef_file=im.itu.ID_COEF_FILE, max_path_length=2),
                                              im.ibd_hmm.prob_ibd_hmm, use_kinship=False)
    im.itu.assert_segments_almost_equal(segment_set,
                                        [((0   , 1420), (16484792, 33032458, 16.548, 1), ((2, 0), (0, 0))),
                                         ((241 , 446), (19643555, 23761236, 4.118, 1), ((0, 0), (2, 1))),
                                         ((2215, 2270), (40876234, 42241372, 1.365, 1), ((0, 0), (2, 1))),
                                         ((3138, 3206), (49803008, 50837224, 1.034, 1), ((0, 0), (2, 1))),
                                         ((1278, 1337), (31217345, 32331594, 1.114, 1), ((0, 1), (2, 0))),
                                         ((1411, 3218), (32978753, 51156934, 18.178, 1), ((0, 1), (2, 0))),
                                         ((1278, 1337), (31217345, 32331594, 1.114, 1), ((3, 0), (0, 0))),
                                         ((241 , 451), (19643555, 23817486, 4.174, 1), ((0, 0), (3, 1))),
                                         ((2276, 2363), (42344297, 43527681, 1.183, 1), ((0, 0), (3, 1))),
                                         ((3138, 3206), (49803008, 50837224, 1.034, 1), ((0, 0), (3, 1))),
                                         ((0   , 3218), (16484792, 51156934, 34.672, 1), ((0, 1), (3, 0))),
                                         ((1278, 1337), (31217345, 32331594, 1.114, 1), ((0, 0), (4, 0))),
                                         ((2552, 3218), (45011952, 51156934, 6.145, 1), ((0, 0), (4, 0))),
                                         ((385 , 451), (22583252, 23817486, 1.234, 1), ((0, 0), (4, 1))),
                                         ((2215, 2270), (40876234, 42241372, 1.365, 1), ((0, 0), (4, 1))),
                                         ((0   , 2571), (16484792, 45231758, 28.747, 1), ((0, 1), (4, 0))),
                                         ((0   , 3218), (16484792, 51156934, 34.672, 1), ((0, 0), (5, 0))),
                                         ((385 , 451), (22583252, 23817486, 1.234, 1), ((5, 1), (0, 0))),
                                         ((2276, 2363), (42344297, 43527681, 1.183, 1), ((5, 1), (0, 0))),
                                         ((3138, 3206), (49803008, 50837224, 1.034, 1), ((5, 1), (0, 0))),
                                         ((1278, 1337), (31217345, 32331594, 1.114, 1), ((0, 1), (5, 0))),
                                         ((0   , 805), (16484792, 27119061, 10.634, 1), ((0, 0), (6, 0))),
                                         ((1278, 1337), (31217345, 32331594, 1.114, 1), ((0, 0), (6, 0))),
                                         ((241 , 347), (19643555, 22015144, 2.372, 1), ((6, 1), (0, 0))),
                                         ((385 , 451), (22583252, 23817486, 1.234, 1), ((6, 1), (0, 0))),
                                         ((2276, 2363), (42344297, 43527681, 1.183, 1), ((6, 1), (0, 0))),
                                         ((3138, 3206), (49803008, 50837224, 1.034, 1), ((6, 1), (0, 0))),
                                         ((762 , 3218), (26836780, 51156934, 24.320, 1), ((0, 1), (6, 0)))],
                                        full_data=True, decimal=3, err_msg='Wrong IBD segments')
def __init__(self, cache, haplotype, chrom, samples, sample_id, threshold, params=None):
    '''Store the inputs of an IBDLD-based IBD segment computation.

    cache -- segment cache object, stored as-is.
    haplotype -- haplotype object; its data, num_snps and snp['base_pair'] fields are cached
    on this instance.
    chrom -- chromosome identifier, stored as-is.
    samples -- samples to compute segments for, stored as-is.
    sample_id -- sample ID array, stored as-is.
    threshold -- threshold value, stored as-is.
    params -- PhaseParam object. When omitted (None), a new PhaseParam() is created for this
    instance. The previous default, params=PhaseParam(), was evaluated once at
    function-definition time and therefore shared between all instances.'''
    self.__cache = cache
    self.__sample_id = sample_id
    self.__haplotype = haplotype
    self.__chrom = chrom
    self.__samples = samples
    self.__threshold = threshold
    self.__params = PhaseParam() if params is None else params
    self.__hap_comparator = _hap_comparator_ibdld

    # Frequently-used haplotype fields (num_snps was previously assigned twice)
    self.__data = haplotype.data
    self.__num_snps = haplotype.num_snps
    self._bp = haplotype.snp['base_pair']
def ibd_segments_with_surrogate_parents(problem, i, min_path_length, max_path_length,
                                        surrogate_parent_fill_threshold=0.9,
                                        params=None,
                                        prob_ibd_calculator=prob_ibd_ibs):
    '''A utility method that calculates likely IBD segments between i and surrogate parents.

    The surrogate parents are the relatives of i returned by
    RelativeCollection.in_neighborhood() for path lengths between min_path_length and
    max_path_length; the actual segment computation is delegated to
    ibd_segments_with_relatives(), whose result is returned.

    Original author notes: supports minimum required IBD segment length vs. d meioses
    separating two samples. Assumes an exponential distribution and computes the value x for
    which P(x >= X) = ibd_length_upper_percentile. Important: use a short median filter
    window size (~3) to catch more spikes since we are less certain that spikes are genotype
    errors than in parent-child IBD calculations.

    problem -- the problem object.
    i -- index of the sample to find surrogate parents for.
    min_path_length, max_path_length -- bounds passed to the relative search.
    surrogate_parent_fill_threshold -- NOTE(review): currently unused; the fill threshold
    passed to the relative search is params.surrogate_parent_fill_threshold. Confirm which
    of the two is intended before relying on this argument.
    params -- PhaseParam object. When omitted (None), a new PhaseParam() is created per call
    (the previous default instance was created once at definition time and shared).
    prob_ibd_calculator -- IBD probability functor forwarded to
    ibd_segments_with_relatives().'''
    if params is None:
        params = PhaseParam()

    # Find the set of relatives of i whose path length to i is within the requested bounds
    relatives = RelativeCollection.in_neighborhood(
        i, problem, min_path_length, max_path_length,
        params.surrogate_parent_fill_threshold)
    return ibd_segments_with_relatives(problem, i, relatives.info['index'], params,
                                       prob_ibd_calculator)
def _phase_in_ibd_segment(problem, snp, haps, consensus, debug=False, params=PhaseParam()): '''Impute haplotypes in an IBD-sharing sample set haps over a SNP array snp, or a SNP array segment [snp[0],snp[1]], if it is a tuple. Use the concensus functor to calculator a consensus haplotype and copy it to all other IBD-sharing samples. haps is a list of (sample,hap) tuples, where sample=sample ID and hap=allele (paternal/maternal) that are assumed to be IBD in this segment. The other allele in each haplotype is inferred from the genotype. concensus = 'max': if a non-zero value is found, it is the concensus. This should be applied when all haps are certain, so all non-missing values should agree. This is cleanly implemented using max(h over samples). concensus = 'majority': majority vote of non-missing values.''' # print 'Phasing in segment (%d,%d)' % (start, stop, ) # problem, params = request.problem, request.params # snp_test_index = params.snp_test_index snps = np.arange(snp[0], snp[1]) if isinstance(snp, tuple) else snp common, hh = _compute_conensus(problem.h, snps, haps, consensus) # Flag samples that are inconsistent with the concensus as errors if we have enough haps # to support the evidence for errors. If there are too many errors, this is a dubious segment errors = _find_errors(problem, snps, haps, consensus, common, hh, params.min_consensus_samples) if debug: print 'Consensus errors (%d)' % len(errors[0]), errors # print 'Consensus errors %d' % len(errors[0]) problem.genotype_error(errors[0], errors[1], 'IBD majority vote inconsistency') # Phase all haps: copy consensus haplotype to missing haplotype entries _phase_by_consensus(problem, snps, haps, common)
def setUp(self):
    '''Load the sib-founders stage 3 test problem and prepare its first family, the
    family's filled members and a GERMLINE IBD computer with default parameters.'''
    self.problem = im.io.read_npz(itu.SIB_FOUNDERS_STAGE3)

    families = self.problem.families(genotyped=False)
    self.family = families[0]
    self.sibs = ig._filled_members(self.problem, self.family)

    default_params = PhaseParam()
    self.ibd_computer = ig.GermlineIbdComputer(default_params)
def setUp(self):
    '''Load the family 4 stage 3 test problem and prepare its first family, the family's
    filled members and a GERMLINE IBD computer with default parameters.'''
    problem = im.io.read_npz(itu.FAMILY4_STAGE3)
    self.problem = problem
    self.family = problem.first_family
    self.sibs = ig._filled_members(self.problem, self.family)

    default_params = PhaseParam()
    self.ibd_computer = ig.GermlineIbdComputer(default_params)
============================================================
'''
import impute as im, itertools, matplotlib.pyplot as P, sys
from impute.tools.param import PhaseParam

# print sys.argv
# Plots are generated by default; pass an integer as the first command-line argument
# (0 = off, non-zero = on) to override.
generate_plots = True if (len(sys.argv) < 2) else bool(int(sys.argv[1]))

# Load the test problem and enumerate (sample, allele) pairs for the genotyped members of
# its first family, plus the family's genotyped children.
p = im.io.read_npz(im.itu.FAMILY_TOO_ZEROED_STAGE2)
haps = list(
    itertools.product(im.gt.genotyped_members(p, p.first_family), xrange(2)))
children = im.gt.genotyped_children(p, p.first_family)

# IBD between sib pairs
child = 3
sib = 2
# Segments between the child and its single sib, using the HMM IBD probability calculator
sib_ibd = im.ibd_distant.ibd_segments_with_relatives(
    p, child, [sib],
    PhaseParam(margin=0., surrogate_parent_fill_threshold=0.9, debug=True),
    im.ibd_hmm.prob_ibd_hmm)
print sib_ibd

if generate_plots:
    P.figure(1)
    # Both alleles of the child and of the sib
    child_haps = list(itertools.product([child, sib], xrange(2)))
    g = im.plots.plot_hap_coloring(sib_ibd, child_haps, pair_gap=10, linewidth=6,
                                   title='Sib IBD Segments', snp_range=(0, p.num_snps))
    # P.savefig(os.environ['OBER'] + '/doc/ibd/hmm/family_ibd_hmm_sib.png')
def test_ibd_segments_hmm(self):
    '''Test locating IBD segments between the unphased proband and its sib surrogate
    parents. Uses HMM IBD posterior.

    The segments between self.s and its phased sibs are grouped into disjoint segments in
    place and then compared with the expected list to 3 decimal places.'''
    # Phased sibs of the proband, used as the relatives to compare against
    relatives = self.__phased_sibs(self.s)
    segment_set = im.idist.ibd_segments_with_relatives(
        self.problem, self.s, relatives, PhaseParam(debug=False), im.ibd_hmm.prob_ibd_hmm)
    # Comparison below is against disjoint (grouped) segments
    segment_set.group_to_disjoint()
    assert_segments_almost_equal(
        segment_set,
        [((0, 96), (16484792, 17948473, 1.464, 0), ((2, 0), (3, 1), (4, 1), (2, 1))),
         ((96, 344), (17948473, 21460008, 3.512, 0), ((3, 1), (4, 1), (2, 1))),
         ((344, 380), (21460008, 22554306, 1.094, 0), ((2, 0), (3, 1), (4, 1), (2, 1), (4, 0))),
         ((380, 383), (22554306, 22555078, 0.001, 0), ((3, 1), (4, 1), (2, 1))),
         ((383, 438), (22555078, 23636541, 1.081, 0), ((2, 0), (3, 1), (4, 1), (2, 1), (4, 0))),
         ((438, 442), (23636541, 23695404, 0.059, 0), ((3, 1), (4, 1), (2, 1))),
         ((442, 519), (23695404, 24406778, 0.711, 0), ((2, 0), (3, 1), (4, 1), (2, 1), (4, 0))),
         ((519, 557), (24406778, 25088629, 0.682, 0), ((0, 1), (2, 0), (3, 1), (4, 1), (2, 1), (4, 0))),
         ((557, 951), (25088629, 27698217, 2.610, 0), ((0, 1), (3, 1), (4, 1), (2, 1))),
         ((951, 969), (27698217, 27832985, 0.135, 0), ((0, 1), (2, 0), (3, 1), (4, 1), (2, 1))),
         ((969, 1019), (27832985, 28093392, 0.260, 0), ((2, 0), (3, 1), (4, 1), (2, 1))),
         ((1019, 1147), (28093392, 29670939, 1.578, 0), ((2, 0), (3, 1), (4, 1), (2, 1), (4, 0))),
         ((1147, 1188), (29670939, 30113960, 0.443, 0), ((2, 0), (3, 1), (4, 1), (2, 1))),
         ((1188, 1403), (30113960, 32950053, 2.836, 0), ((2, 0), (3, 1), (4, 1), (2, 1), (4, 0))),
         ((1403, 1943), (32950053, 36891858, 3.942, 0), ((2, 0), (3, 1), (4, 1), (2, 1))),
         ((1943, 2053), (36891858, 37982012, 1.090, 0), ((2, 0), (3, 1), (4, 1), (2, 1), (4, 0))),
         ((2053, 2055), (37982012, 38086574, 0.105, 0), ((2, 0), (3, 1), (4, 1), (2, 1))),
         ((2055, 2133), (38086574, 39454432, 1.368, 0), ((2, 0), (3, 1), (4, 1), (2, 1), (4, 0))),
         ((2133, 2174), (39454432, 40018212, 0.564, 0), ((2, 0), (3, 1), (4, 1), (2, 1))),
         ((2174, 2221), (40018212, 41107688, 1.089, 0), ((2, 0), (3, 1), (4, 1), (2, 1), (4, 0))),
         ((2221, 2612), (41107688, 45515269, 4.408, 0), ((2, 0), (3, 1), (4, 1), (2, 1))),
         ((2612, 2661), (45515269, 45972017, 0.457, 0), ((2, 0), (3, 1), (4, 1), (2, 1), (4, 0))),
         ((2661, 2735), (45972017, 47094390, 1.122, 0), ((0, 1), (0, 0), (3, 1), (2, 1), (2, 0), (1, 0), (4, 1), (1, 1), (4, 0))),
         ((2735, 2945), (47094390, 48569604, 1.475, 0), ((2, 0), (0, 0), (1, 0), (3, 1), (4, 1), (2, 1), (4, 0))),
         ((2945, 2991), (48569604, 48741583, 0.172, 0), ((2, 0), (0, 0), (1, 0), (3, 1), (4, 1), (4, 0))),
         ((2991, 3127), (48741583, 49752332, 1.011, 0), ((2, 0), (0, 0), (1, 0), (3, 1), (4, 1), (2, 1), (4, 0))),
         ((3127, 3177), (49752332, 50120255, 0.368, 0), ((2, 0), (1, 0), (3, 1), (4, 1), (0, 0))),
         ((3177, 3218), (50120255, 51156934, 1.037, 0), ((0, 1), (0, 0), (3, 1), (2, 1), (2, 0), (1, 0), (4, 1), (1, 1), (4, 0)))],
        decimal=3, err_msg='Wrong IBD segments')