Example #1
0
def concatenate_segments(segments):
    '''Merge an iterable of segment sets into a single SegmentSet.

    Every item produced by ``segments`` contributes the segments obtained by iterating over it
    and its ``errors`` attribute. The accumulated segments and errors are wrapped in a new
    SegmentSet, which is returned.'''
    merged_segments = []
    merged_errors = gt.empty_errors_array()
    for segment_set in segments:
        # Extending with the item itself adds each of its member segments
        merged_segments.extend(segment_set)
        merged_errors = gt.concatenate_errors(merged_errors, segment_set.errors)
    return SegmentSet(merged_segments, merged_errors)
Example #2
0
def phase_at_snp_using_training_set(snp_index, ibd, h, t, training_sample_set):
    '''Phase using IBD segments containing the training_sample_set.

    For each IBD segment found at the base-pair position of SNP snp_index, the alleles already
    stored in h for the segment's training haplotypes are inspected:
    - if they are all equal, that allele is copied into the h-entries of every haplotype in
      the segment (h is modified in place);
    - if they differ, nothing is copied and one error column (snp_index, hap_id) is recorded
      for each haplotype that carries a known allele.

    Arguments:
    snp_index - index of the SNP to phase (first axis of h).
    ibd - IBD segment container passed to segments_ibd_at_bp().
    h - haplotype array indexed as h[snp, i1, i2]; an entry of 0 is treated as unknown.
    t - object whose snp['base_pair'] array gives the SNP's base-pair position.
    training_sample_set - samples passed to the IBD segment lookup.

    Returns the errors array: empty_errors_array() with a 2 x n block (row 0 = snp_index,
    row 1 = hap_ids) appended along axis 1 for every contradicting segment.'''
    errors = empty_errors_array()
    bp = t.snp['base_pair'][snp_index]
    segments = im.imputation.ibd_lookup.segments_ibd_at_bp(ibd, (bp, bp), samples=training_sample_set)
    for ibd_segment in segments:
        s = np.array(list(ibd_segment.samples))
        hap_ids = np.array([x[0] for x in ibd_segment.samples])
        # NOTE(review): training_sample is not a parameter or local of this function, so it must
        # resolve from an enclosing or module scope -- confirm it is not meant to be
        # training_sample_set.
        i = np.in1d(hap_ids, training_sample).nonzero()[0]
        (i1, i2) = s[i].transpose()
        alleles = h[snp_index, i1, i2]
        known = alleles.nonzero()[0]
        if known.size:
            # There exist phased training samples
            hap_ids = hap_ids[i[known]]
            alleles = alleles[known]
            # Test whether ANY consecutive pair of alleles differs. Using the index array
            # returned by nonzero() as a condition is wrong: a single mismatch at position 0
            # is falsy, and several mismatches raise "truth value is ambiguous".
            if np.diff(alleles).any():
                # Contradicting alleles, flag errors
                print 'Error: contradicting alleles at snp index', snp_index, 'hap_ids', hap_ids, 'alleles', alleles
                e = np.zeros((2, len(hap_ids)), dtype=np.uint)
                e[0, :] = snp_index
                e[1, :] = hap_ids
                errors = np.concatenate((errors, e), axis=1)
            else:
                # All known alleles agree (there is at least one, since known.size > 0).
                # Copy the known allele to all IBD haplotypes' h-entries
                (i1, i2) = s.transpose()
                print 'phasing at i1', i1, 'i2', i2, 'allele value', alleles[0]
                h[snp_index, i1, i2] = alleles[0]
    return errors
Example #3
0
def phase_at_snp_using_training_set(snp_index, ibd, h, t, training_sample_set):
    '''Phase using IBD segments containing the training_sample_set.

    For each IBD segment found at the base-pair position of SNP snp_index, the alleles already
    stored in h for the segment's training haplotypes are inspected:
    - if they are all equal, that allele is copied into the h-entries of every haplotype in
      the segment (h is modified in place);
    - if they differ, nothing is copied and one error column (snp_index, hap_id) is recorded
      for each haplotype that carries a known allele.

    Arguments:
    snp_index - index of the SNP to phase (first axis of h).
    ibd - IBD segment container passed to segments_ibd_at_bp().
    h - haplotype array indexed as h[snp, i1, i2]; an entry of 0 is treated as unknown.
    t - object whose snp['base_pair'] array gives the SNP's base-pair position.
    training_sample_set - samples passed to the IBD segment lookup.

    Returns the errors array: empty_errors_array() with a 2 x n block (row 0 = snp_index,
    row 1 = hap_ids) appended along axis 1 for every contradicting segment.'''
    errors = empty_errors_array()
    bp = t.snp['base_pair'][snp_index]
    segments = im.imputation.ibd_lookup.segments_ibd_at_bp(
        ibd, (bp, bp), samples=training_sample_set)
    for ibd_segment in segments:
        s = np.array(list(ibd_segment.samples))
        hap_ids = np.array([x[0] for x in ibd_segment.samples])
        # NOTE(review): training_sample is not a parameter or local of this function, so it must
        # resolve from an enclosing or module scope -- confirm it is not meant to be
        # training_sample_set.
        i = np.in1d(hap_ids, training_sample).nonzero()[0]
        (i1, i2) = s[i].transpose()
        alleles = h[snp_index, i1, i2]
        known = alleles.nonzero()[0]
        if known.size:
            # There exist phased training samples
            hap_ids = hap_ids[i[known]]
            alleles = alleles[known]
            # Test whether ANY consecutive pair of alleles differs. Using the index array
            # returned by nonzero() as a condition is wrong: a single mismatch at position 0
            # is falsy, and several mismatches raise "truth value is ambiguous".
            if np.diff(alleles).any():
                # Contradicting alleles, flag errors
                print 'Error: contradicting alleles at snp index', snp_index, 'hap_ids', hap_ids, 'alleles', alleles
                e = np.zeros((2, len(hap_ids)), dtype=np.uint)
                e[0, :] = snp_index
                e[1, :] = hap_ids
                errors = np.concatenate((errors, e), axis=1)
            else:
                # All known alleles agree (there is at least one, since known.size > 0).
                # Copy the known allele to all IBD haplotypes' h-entries
                (i1, i2) = s.transpose()
                print 'phasing at i1', i1, 'i2', i2, 'allele value', alleles[0]
                h[snp_index, i1, i2] = alleles[0]
    return errors
Example #4
0
def concatenate_segments(segments):
    '''Combine several segment sets (e.g. from a generator expression) into one SegmentSet.

    The member segments of every item are appended, in order, to one list, and the items'
    ``errors`` arrays are merged with gt.concatenate_errors(). The result is a new SegmentSet
    built from the combined list and the combined errors.'''
    combined = []
    combined_errors = gt.empty_errors_array()
    for item in segments:
        # Iterating over an item yields its member segments
        combined.extend(item)
        combined_errors = gt.concatenate_errors(combined_errors, item.errors)
    return SegmentSet(combined, combined_errors)
Example #5
0
def genotype_ibs_segments(genotype,
                          id1,
                          id2,
                          snps,
                          error_filter='median',
                          error_filter_length=5,
                          margin=0.0,
                          min_ibs_len_snp=400,
                          debug=False):
    '''Return Identical-by-State (IBS >= 1) segments between two genoypes of samples id1 and id2
    in the SNP range [snp[0],snp[1]) (if snp is a tuple) or the subset of SNPs, if snps is an array.
    
    See ibs_segments() for a description of optional parameters.'''
    num_snps = genotype.num_snps
    g = genotype.data
    g1 = recode.recode_single_genotype(g[snps, id1, :])
    g2 = recode.recode_single_genotype(g[snps, id2, :])
    d = (recode.ibs_state(g1, g2) == 0).astype(np.byte)

    # Consider informative or the specified SNPs only
    filtered_diff = filter_diff(d, error_filter, error_filter_length)
    error_snps = snps[np.nonzero(d - filtered_diff)[0]]

    # Detect edges as non-zero gradient points; output sufficiently long segments
    if np.size(filtered_diff) == 0:
        # No data to consider ==> no IBD intervals can be identified
        segments = []
    else:
        # Convert recombination locations to segments of no recombination; filter short segments
        bp = genotype.snp['base_pair']
        #print segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)
        segments = [
            Segment(((x[0], x[1])), [id1, id2],
                    (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)),
                    error_snps=segment.in_segment(error_snps, x),
                    collapse_to_set=False) for x in
            segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)
        ]

    # Cut segment margins
    if margin >= constants.SMALL_FLOAT:
        segments = [
            s for s in (s.middle_part(
                genotype.nearest_snp, bp, margin, collapse_to_set=False)
                        for s in segments) if s
        ]

    # Restrict errors to those inside segments
    segment_set = SegmentSet(segments,
                             np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)),
                                                              np.array([id1, id2])), dtype=int) \
                             if segments else gt.empty_errors_array())
    if debug:
        print 'ibs_segments()', segment_set
        print 'errors', segment_set.errors
    return segment_set
Example #6
0
 def test_ibd_segments_ibdld(self):
     '''Check the grouped IBD segments that IBDLD yields for samples 2 and 8 of a nuclear family.'''
     ibd_file = itu.FAMILY7 + '.ibd'
     cache = im.ibdld.ibd_ld.IbdSegmentGlobalCacheIbdld(ibd_file)
     computer = im.ibdld.ibd_ld.IbdSegmentComputerIbdld(
         cache, self.haplotype,
         chrom=22,
         sample_id=self.problem.pedigree.sample_id,
         samples=[2, 8],
         threshold=0.9,
         params=PhaseParam())
     grouped = segment.break_and_group_segments(computer.segments)

     # A single expected segment, compared with full_data=False
     expected = [((38, 2849), [], ((8, 1), (2, 1)))]
     assert_segments_almost_equal(grouped, expected,
                                  full_data=False, decimal=3,
                                  err_msg='Wrong grouped IBDLD IBD segments')
     # The resulting set must carry no errors
     assert_equal(grouped.errors, empty_errors_array(),
                  'IBDLD does not support errors but they are output?!')
Example #7
0
def _find_errors(problem, snps, haps, consensus, common, hh, min_consensus_samples):
    '''Return the new errors found for the haplotypes hh, in original (SNP, sample) coordinates.

    Errors are looked up only when consensus is 'majority' and hh has at least
    min_consensus_samples rows; otherwise an empty errors array is returned. The lookup is
    delegated to _find_new_errors(common, hh, current_status). A non-empty result is
    converted to the pair (snps[errors[1]], haps[errors[0], 0]); an empty result is returned
    as is.'''
    # Always computed first, as in the original flow, even if no lookup takes place
    current_status = _get_current_error_status(problem, haps, snps)

    enough_samples = consensus == 'majority' and hh.shape[0] >= min_consensus_samples
    if not enough_samples:
        return gt.empty_errors_array()

    found = _find_new_errors(common, hh, current_status)
    if not found[0].size:
        return found
    # Convert to original coordinates: (SNP, sample)
    return (snps[found[1]], haps[found[0], 0])
Example #8
0
def _find_errors(problem, snps, haps, consensus, common, hh,
                 min_consensus_samples):
    '''Locate new errors among the haplotypes hh and report them as (SNP, sample) coordinates.

    A lookup is made only for consensus == 'majority' with at least min_consensus_samples
    rows in hh; in every other case gt.empty_errors_array() is returned. The result of
    _find_new_errors(common, hh, status) is mapped to (snps[errors[1]], haps[errors[0], 0])
    when its first component is non-empty, and returned unchanged otherwise.'''
    # The current error status is fetched up front, regardless of the consensus mode
    status = _get_current_error_status(problem, haps, snps)

    if consensus != 'majority' or hh.shape[0] < min_consensus_samples:
        return gt.empty_errors_array()

    new_errors = _find_new_errors(common, hh, status)
    if new_errors[0].size:
        # Convert to original coordinates: (SNP, sample)
        snp_coords = snps[new_errors[1]]
        sample_coords = haps[new_errors[0], 0]
        return (snp_coords, sample_coords)
    return new_errors
Example #9
0
def genotype_ibs_segments(genotype, id1, id2, snps,
                          error_filter='median', error_filter_length=5, margin=0.0,
                          min_ibs_len_snp=400, debug=False):
    '''Return Identical-by-State (IBS >= 1) segments between two genoypes of samples id1 and id2
    in the SNP range [snp[0],snp[1]) (if snp is a tuple) or the subset of SNPs, if snps is an array.
    
    See ibs_segments() for a description of optional parameters.'''
    num_snps = genotype.num_snps
    g = genotype.data
    g1 = recode.recode_single_genotype(g[snps, id1, :])
    g2 = recode.recode_single_genotype(g[snps, id2, :])
    d = (recode.ibs_state(g1, g2) == 0).astype(np.byte)

    # Consider informative or the specified SNPs only
    filtered_diff = filter_diff(d, error_filter, error_filter_length)
    error_snps = snps[np.nonzero(d - filtered_diff)[0]]
    
    # Detect edges as non-zero gradient points; output sufficiently long segments
    if np.size(filtered_diff) == 0:
        # No data to consider ==> no IBD intervals can be identified
        segments = []
    else:
        # Convert recombination locations to segments of no recombination; filter short segments
        bp = genotype.snp['base_pair']
        #print segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)
        segments = [Segment(((x[0], x[1])), [id1, id2], (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)),
                            error_snps=segment.in_segment(error_snps, x), collapse_to_set=False)
                    for x in segment.segments_with_value(filtered_diff, 0, min_ibs_len_snp)]
    
    # Cut segment margins
    if margin >= constants.SMALL_FLOAT:
        segments = [s for s in (s.middle_part(genotype.nearest_snp, bp, margin, collapse_to_set=False)
                                for s in segments) if s]

    # Restrict errors to those inside segments
    segment_set = SegmentSet(segments,
                             np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)),
                                                              np.array([id1, id2])), dtype=int) \
                             if segments else gt.empty_errors_array())
    if debug:
        print 'ibs_segments()', segment_set
        print 'errors', segment_set.errors
    return segment_set
Example #10
0
def ibs_segments(haplotype,
                 id1,
                 id2,
                 hap1_type,
                 hap2_type,
                 snps=None,
                 include_alt_phase=False,
                 error_filter='median',
                 error_filter_length=5,
                 length_bound=None,
                 min_segment_length=INDETERMINATE,
                 margin=0.0,
                 debug=False):
    '''Return 1) Identical-by-State (IBS) segments separated by recombination events between two
    sample haplotypes (id1, hap1_type) and (id2, hap2_type). The 2-D output array's ith row format is
    
    (segment_start, segment_stop),
    (id1, hap1), (id2, hap2), 
    (segment_start_bp, segment_stop_bp, segment_length_in_bp, num_errors_in_segment) 
    
    The SNP range is [segment_start, segment_stop) where start=inclusive and stop is exclusive.
    2) List of het_snp indices at which there are likely genotype errors.
        
    Options:
    snps - list of SNPs to base the comparison on. For parent-child comparisons, these should
    be heterozygous SNPs in the parent's genotype, distinguishing its haplotypes
    and used to locate segments. For unphased-phased individuals, these should be the list of
    homozygous SNPs at the unphased individual (those that have data).
    If not specified, all SNPs are used.
    
    length_bound - minimum segment length bound type:
        None: no lower bound enforced 
        'base_pair': output segments of at least min_segment_length [base pair]
        'snp': output segments of at least min_segment_length consecutive SNPs out of the snps list.
               This is useful only if snps includes all SNPs (or is None) 
        *NOTE*: min_segment_length''s units are interpreted differently depending on length_bound.
         
    margin = fraction of segment to discard near the endpoints (margin/2 is removed from each side).'''

    if debug:
        print 'Computing IBD segments between haplotypes (%d,%d), (%d,%d); filter %s length %d' % \
        (id1, hap1_type, id2, hap2_type, error_filter, error_filter_length)
    d = diff.all_diffs(haplotype.data,
                       id1,
                       id2,
                       hap1_type=hap1_type,
                       hap2_type=hap2_type)[0]
    # Segment length, as defined by the input parameters
    segment_length = lambda f: np.inf if not length_bound else (
        f.length
        if length_bound == 'base_pair' else f.num_snps)  # @UnusedVariable

    # Consider informative or the specified SNPs only
    snps = snps if snps is not None else haplotype.snp_range
    snps = np.intersect1d(snps, np.where(d != INDETERMINATE)[0])
    d_snps = d[snps]
    filtered_diff = filter_diff(d_snps, error_filter, error_filter_length)
    error_snps = snps[np.nonzero(d_snps - filtered_diff)[0]]

    # Detect edges as non-zero gradient points; output sufficiently long segments
    bp = haplotype.snp['base_pair']
    num_snps = haplotype.num_snps
    if np.size(filtered_diff) == 0:
        # No data to consider ==> no IBD intervals can be identified
        segments = []
    else:
        deriv = ndimage.convolve(filtered_diff, [1, -1])
        edge = np.where(deriv != 0)[0]
        initial_phase = hap1_type if filtered_diff[0] == 0 else 1 - hap1_type
        if debug:
            print 'initial_phase', initial_phase  # , 'edge', edge
        # Convert recombination locations to segments of no recombination; filter short segments
        segments = [
            f for f in (
                Segment(((x[0], x[1])),
                        set(((id1, x[2]), (id2, hap2_type))), (
                            bp[x[0]], segment.stop_bp(bp, x[1], num_snps)),
                        error_snps=segment.in_segment(error_snps, x))
                for x in segment.edges_to_segments(
                    snps, edge, initial_phase, haplotype.num_snps, hap1_type))
            if segment_length(f) >= min_segment_length
        ]

    # Cut segment margins
    if margin >= constants.SMALL_FLOAT:
        segments = [
            s for s in (s.middle_part(haplotype.nearest_snp, bp, margin)
                        for s in segments) if s
        ]

    # Restrict errors to those inside segments
    segment_set = SegmentSet(segments,
                             np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)),
                                                                        np.array([id1, id2])), dtype=int) \
                             if segments else gt.empty_errors_array())
    if debug:
        print 'ibs_segments()', segment_set
        print 'errors', segment_set.errors
    return segment_set
Example #11
0
    
    # Allocate imputed data structures
    imputed = im.factory.GenotypeFactory.new_instance('haplotype',
                                                      np.zeros((t.num_snps, pedigree.num_genotyped, 2), dtype=np.byte),
                                                      t.snp, t.sample_id)
    h = imputed.data
    
    # Phase all homozygous trainees; place in corresponding locations in h
    hom = np.where(im.gt.is_homozygous(tg)[:, :])
    h[hom[0], training_sample[hom[1]], :] = tg[hom]
    
    # Phase training set. For each SNP:
    # - Find all segments intersecting the SNP
    # - Construct global IBD segments-sets. This is currently a wasteful implementation. TODO: replace ibd by
    #   the global IBD dictionary to speed this part up
    # - Copy known alleles to all IBD haplotypes' h-entries
    # - If known alleles differ, mark errors (at all alleles for now; TODO: use instead majority vote to find a single error among multiple alleles)
    errors = empty_errors_array()
    for snp_index in xrange(t.num_snps):
        chrom = t.snp['chrom'][snp_index]
        print '====== SNP %d: chr%d:%d, %s ======' % (snp_index, chrom, t.snp['base_pair'][snp_index],
                                                      t.snp['name'][snp_index])
        ibd_chrom = ibd[chrom - 1]
        # Phase using homozygous training samples 
        errors = merge_errors(errors, phase_at_snp_using_training_set(snp_index, ibd_chrom, h, t, training_sample_set))
        # Phase using het training samples that now have one allele determined          
        hh = h[snp_index, training_sample, :]
        het_to_phase = training_sample[np.where(((hh[:, 0] == MISSING) ^ (hh[:, 1] == MISSING)) & 
                                                (tg[snp_index, :, 0] != MISSING) & (tg[snp_index, :, 1] != MISSING))[0]]
        errors = merge_errors(errors, phase_at_snp_using_training_set(snp_index, ibd_chrom, h, t, het_to_phase))
Example #12
0
        'haplotype',
        np.zeros((t.num_snps, pedigree.num_genotyped, 2), dtype=np.byte),
        t.snp, t.sample_id)
    h = imputed.data

    # Phase all homozygous trainees; place in corresponding locations in h
    hom = np.where(im.gt.is_homozygous(tg)[:, :])
    h[hom[0], training_sample[hom[1]], :] = tg[hom]

    # Phase training set. For each SNP:
    # - Find all segments intersecting the SNP
    # - Construct global IBD segments-sets. This is currently a wasteful implementation. TODO: replace ibd by
    #   the global IBD dictionary to speed this part up
    # - Copy known alleles to all IBD haplotypes' h-entries
    # - If known alleles differ, mark errors (at all alleles for now; TODO: use instead majority vote to find a single error among multiple alleles)
    errors = empty_errors_array()
    for snp_index in xrange(t.num_snps):
        chrom = t.snp['chrom'][snp_index]
        print '====== SNP %d: chr%d:%d, %s ======' % (
            snp_index, chrom, t.snp['base_pair'][snp_index],
            t.snp['name'][snp_index])
        ibd_chrom = ibd[chrom - 1]
        # Phase using homozygous training samples
        errors = merge_errors(
            errors,
            phase_at_snp_using_training_set(snp_index, ibd_chrom, h, t,
                                            training_sample_set))
        # Phase using het training samples that now have one allele determined
        hh = h[snp_index, training_sample, :]
        het_to_phase = training_sample[
            np.where(((hh[:, 0] == MISSING) ^ (hh[:, 1] == MISSING))
Example #13
0
def ibs_segments(haplotype, id1, id2, hap1_type, hap2_type, snps=None, include_alt_phase=False,
                 error_filter='median', error_filter_length=5,
                 length_bound=None, min_segment_length=INDETERMINATE, margin=0.0, debug=False):
    '''Return 1) Identical-by-State (IBS) segments separated by recombination events between two
    sample haplotypes (id1, hap1_type) and (id2, hap2_type). The 2-D output array's ith row format is
    
    (segment_start, segment_stop),
    (id1, hap1), (id2, hap2), 
    (segment_start_bp, segment_stop_bp, segment_length_in_bp, num_errors_in_segment) 
    
    The SNP range is [segment_start, segment_stop) where start=inclusive and stop is exclusive.
    2) List of het_snp indices at which there are likely genotype errors.
        
    Options:
    snps - list of SNPs to base the comparison on. For parent-child comparisons, these should
    be heterozygous SNPs in the parent's genotype, distinguishing its haplotypes
    and used to locate segments. For unphased-phased individuals, these should be the list of
    homozygous SNPs at the unphased individual (those that have data).
    If not specified, all SNPs are used.
    
    length_bound - minimum segment length bound type:
        None: no lower bound enforced 
        'base_pair': output segments of at least min_segment_length [base pair]
        'snp': output segments of at least min_segment_length consecutive SNPs out of the snps list.
               This is useful only if snps includes all SNPs (or is None) 
        *NOTE*: min_segment_length''s units are interpreted differently depending on length_bound.
         
    margin = fraction of segment to discard near the endpoints (margin/2 is removed from each side).'''
    
    if debug:
        print 'Computing IBD segments between haplotypes (%d,%d), (%d,%d); filter %s length %d' % \
        (id1, hap1_type, id2, hap2_type, error_filter, error_filter_length)
    d = diff.all_diffs(haplotype.data, id1, id2, hap1_type=hap1_type, hap2_type=hap2_type)[0]
    # Segment length, as defined by the input parameters 
    segment_length = lambda f: np.inf if not length_bound else (f.length if length_bound == 'base_pair' else f.num_snps)  # @UnusedVariable
    
    # Consider informative or the specified SNPs only
    snps = snps if snps is not None else haplotype.snp_range
    snps = np.intersect1d(snps, np.where(d != INDETERMINATE)[0])
    d_snps = d[snps]
    filtered_diff = filter_diff(d_snps, error_filter, error_filter_length)    
    error_snps = snps[np.nonzero(d_snps - filtered_diff)[0]]
    
    # Detect edges as non-zero gradient points; output sufficiently long segments
    bp = haplotype.snp['base_pair']
    num_snps = haplotype.num_snps
    if np.size(filtered_diff) == 0:
        # No data to consider ==> no IBD intervals can be identified
        segments = []
    else:
        deriv = ndimage.convolve(filtered_diff, [1, -1])    
        edge = np.where(deriv != 0)[0]
        initial_phase = hap1_type if filtered_diff[0] == 0 else 1 - hap1_type
        if debug:
            print 'initial_phase', initial_phase  # , 'edge', edge
        # Convert recombination locations to segments of no recombination; filter short segments
        segments = [f for f in (Segment(((x[0], x[1])), set(((id1, x[2]), (id2, hap2_type))),
                                        (bp[x[0]], segment.stop_bp(bp, x[1], num_snps)),
                                        error_snps=segment.in_segment(error_snps, x))
                                for x in segment.edges_to_segments(snps, edge, initial_phase,
                                                                   haplotype.num_snps, hap1_type))
                    if segment_length(f) >= min_segment_length]
    
    # Cut segment margins
    if margin >= constants.SMALL_FLOAT:
        segments = [s for s in (s.middle_part(haplotype.nearest_snp, bp, margin) for s in segments) if s]
    
    # Restrict errors to those inside segments
    segment_set = SegmentSet(segments,
                             np.array(util.flattened_meshgrid(reduce(list.__add__, (s.error_snps.tolist() for s in segments)),
                                                                        np.array([id1, id2])), dtype=int) \
                             if segments else gt.empty_errors_array())
    if debug:
        print 'ibs_segments()', segment_set
        print 'errors', segment_set.errors
    return segment_set
Example #14
0
 def __init__(self, segments=None, errors=None):
     '''Initialize a segment set.

     segments - forwarded unchanged to SegmentComposite.__init__().
     errors - errors array to attach to this set. If None (the default), a new array
     obtained from empty_errors_array() is used.'''
     SegmentComposite.__init__(self, segments)
     # Create the default per call. A default written in the signature is evaluated once,
     # when the method is defined, so all sets built without an errors argument would share
     # one and the same array object.
     self.errors = empty_errors_array() if errors is None else errors