Ejemplo n.º 1
0
class Experiment(object):
    '''A validation experiment: start with a Problem object, clear a certain portion of the
    data, run a phaser, and cross-check the hap results against the original genotype data.

    Attributes set by the constructor:
      problem    - working copy of the input problem (same pedigree, copied genotype).
      h          - haplotype array the statistics are read from.
      g_orig     - original genotypes saved before clearing.
      r_orig     - g_orig recoded with recode.recode_single_genotype().
      test_index - index arrays of the test entries; test_index[SNP] and test_index[SAMPLE]
                   are used to index h.
      num_tests  - number of test entries (test_index[0].size).
      fraction   - portion of the data used as the test set, as a ratio (not a percentage).
      fill       - SAMPLE column of problem.fill_fraction(); refreshed by run().'''
    #---------------------------------------------
    # Constructors
    #---------------------------------------------
    def __init__(self, problem, fraction=None, test_index=None):
        '''Initialize an experiment to be run on a problem, clearing out 'fraction' of the data. If test_index
        is specified, these specific test indices are used; otherwise a random fraction is generated.
        
        If test_index = 'hap', data is read from problem.h (haplotype array). The entire array
        is considered as a test array, but nothing is zeroed out. Useful for phasing result stats.

        Exactly one of fraction and test_index must be given; otherwise ValueError is raised.'''
        # Exactly one of the two selectors may be supplied (neither or both is an error).
        if (fraction is None) == (test_index is None):
            raise ValueError('Must specify fraction or test_index')
        # Create a working copy of the problem. Only the data is copied.
        self.problem = Problem(problem.pedigree, problem.genotype.copy())
        self.h = self.problem.h
        
        # Create test set; save original genotypes in g_orig
        if test_index is None:
            self.fraction = fraction
            self.g_orig, i = clear_random_portion(self.problem.genotype.data, fraction)
        elif isinstance(test_index, (str, type(u''))) and test_index == 'hap':
            # Only compare against 'hap' when test_index is a string: comparing an index
            # array with a string is element-wise in recent NumPy and has no single truth value.
            # Don't clear anything; call everything a test index.
            h = problem.h
            i = tuple(util.flattened_meshgrid(range(h.shape[0]), range(h.shape[1])))
            self.g_orig = problem.g
            # Statistics are read from the caller's haplotypes, not from the working copy.
            self.h = h
            self.fraction = 1.0
        else:
            self.g_orig, i = clear_index(self.problem.g, test_index)
            self.fraction = (1.0 * i[0].size) / (self.h.shape[0] * self.h.shape[1])
        self.num_tests = i[0].size
        self.test_index = i
        self.r_orig = recode.recode_single_genotype(self.g_orig)
        self.fill = self.problem.fill_fraction()[:, SAMPLE]
        # Lazily computed by the recoded_genotype property.
        self.__recode_single_genotype = None
    
    #---------------------------------------------
    # Methods
    #---------------------------------------------
    def __repr__(self):
        # self.fraction is a ratio; scale it so that the '%' sign in the output is truthful.
        return 'Experiment[%s, fraction=%.2f%%]' % (repr(self.problem), 100.0 * self.fraction)
    
    def run(self, phaser, params=None):
        '''Run phaser (or more generally, a handler) on the working problem and refresh the
        fill fractions. Statistics cached from a previous state are discarded.'''
        phaser.run(self.problem, params=params)
        self.fill = self.problem.fill_fraction()[:, SAMPLE]
        # The recoded test set may have been cached before this run; force it to be
        # recomputed so that the statistics reflect the phaser's output.
        self.__recode_single_genotype = None
    
    def num_test_genotypes(self, field):
        '''Return the number of test genotypes, broken by field (SNP=0, sample=1).'''
        return self.__group_by_field(np.arange(len(self.test_index[0])), field)
    
    def where_called(self):
        '''Return the indices of genotypes in which both alleles were called.''' 
        # Positive entries of r = called entries
        return recode.where_called(self.recoded_genotype)[0]

    def called(self, field):
        '''Return the number of genotypes in which both alleles were called, broken by field (SNP=0, sample=1).'''
        return self.__group_by_field(self.where_called(), field)

    #---------------------------------------------
    # Properties
    #---------------------------------------------
    @property
    def test_orig(self):
        '''Return the recoded test entries of the haplotype array.'''
        # NOTE(review): this used to be documented as "the original set of deleted test
        # genotypes", but it recodes self.h at the test indices (an uncached equivalent of
        # recoded_genotype). The saved originals appear to be in self.r_orig - confirm which
        # one callers expect before changing the code.
        i = self.test_index
        return recode.recode_single_genotype(self.h[i[SNP], i[SAMPLE], :])

    @property
    def test_called(self):
        '''Return the called set of haplotypes corresponding to the test genotypes.'''
        i = self.test_index
        return self.h[i[SNP], i[SAMPLE], :]
    
    @property
    def recoded_genotype(self):
        '''Return the genotype test set, recoded as a single number per allele pair.
        The result is cached until the next call to run().'''
        if self.__recode_single_genotype is None:
            self.__recode_single_genotype = recode.recode_single_genotype(self.test_called) 
        return self.__recode_single_genotype

    @property    
    def total_called(self):
        '''Return the number of genotypes in which both alleles were called.''' 
        return len(self.where_called())

    @property    
    def total_partial_called(self):
        '''Return the number of genotypes in which one allele was called.''' 
        return len(recode.where_partial_called(self.recoded_genotype)[0])
    
    @property    
    def total_errors(self):
        '''Return the number of genotypes that were called incorrectly. (A genotype is an allele pair.)''' 
        # Count entries that were called (positive) AND are different than the corresponding original value
        return len(recode.where_error(self.recoded_genotype, self.r_orig)[0])
    
    @property    
    def total_partial_errors(self):
        '''Return the number of half-called genotypes that were called incorrectly.''' 
        # This happens when hap=(0,1), genotype=(2,2) or hap(0,2), genotype=(1,1)
        return len(recode.where_partial_error(self.recoded_genotype, self.r_orig)[0])

    @property
    def full_call_fraction(self):
        '''Return the fraction of test genotypes that were fully called.''' 
        return self.__fraction_of_tests(self.total_called)
    
    @property
    def partial_call_fraction(self):
        '''Return the fraction of test genotypes that were half-called.''' 
        return self.__fraction_of_tests(self.total_partial_called)

    @property
    def full_error_fraction(self):
        '''Return the fraction of test genotypes that were erroneously fully-called.''' 
        return self.__fraction_of_tests(self.total_errors)

    @property
    def partial_error_fraction(self):
        '''Return the fraction of test genotypes that were erroneously half-called.''' 
        return self.__fraction_of_tests(self.total_partial_errors)

    @property
    def stats(self):
        '''Return a tuple containing all experiment statistics: test fraction, full and partial
        call fractions, full and partial error fractions.'''
        return (self.fraction,
                self.full_call_fraction,
                self.partial_call_fraction,
                self.full_error_fraction,
                self.partial_error_fraction)
        
    #---------------------------------------------
    # Private Methods
    #---------------------------------------------
    def __fraction_of_tests(self, count):
        '''Return count divided by the number of test genotypes, as a float.'''
        return (1.0 * count) / self.num_tests

    def __group_by_field(self, i, field):
        '''Group a test index subset i by field (SNP=0, sample=1). Returns an integer array with
        one entry per index along that field of the genotype data; entries that do not occur
        in the subset stay 0.'''
        size = self.problem.genotype.data.shape[field]
        group_count = util.dict_to_array(statutil.group_by_value(self.test_index[field][i]))
        result = np.zeros((size,), dtype=int)
        # 'k' holds the grouped field values and 'v' the per-value results of group_by_value
        # (presumably occurrence counts - confirm against statutil).
        result[group_count['k']] = group_count['v']
        return result
Ejemplo n.º 2
0
class Experiment(object):
    '''A validation experiment: start with a Problem object, clear a certain portion of the
    data, run a phaser, and cross-check the hap results against the original genotype data.

    Attributes set by the constructor:
      problem    - working copy of the input problem (same pedigree, copied genotype).
      h          - haplotype array the statistics are read from.
      g_orig     - original genotypes saved before clearing.
      r_orig     - g_orig recoded with recode.recode_single_genotype().
      test_index - index arrays of the test entries; test_index[SNP] and test_index[SAMPLE]
                   are used to index h.
      num_tests  - number of test entries (test_index[0].size).
      fraction   - portion of the data used as the test set, as a ratio (not a percentage).
      fill       - SAMPLE column of problem.fill_fraction(); refreshed by run().'''

    #---------------------------------------------
    # Constructors
    #---------------------------------------------
    def __init__(self, problem, fraction=None, test_index=None):
        '''Initialize an experiment to be run on a problem, clearing out 'fraction' of the data. If test_index
        is specified, these specific test indices are used; otherwise a random fraction is generated.
        
        If test_index = 'hap', data is read from problem.h (haplotype array). The entire array
        is considered as a test array, but nothing is zeroed out. Useful for phasing result stats.

        Exactly one of fraction and test_index must be given; otherwise ValueError is raised.'''
        # Exactly one of the two selectors may be supplied (neither or both is an error).
        if (fraction is None) == (test_index is None):
            raise ValueError('Must specify fraction or test_index')
        # Create a working copy of the problem. Only the data is copied.
        self.problem = Problem(problem.pedigree, problem.genotype.copy())
        self.h = self.problem.h

        # Create test set; save original genotypes in g_orig
        if test_index is None:
            self.fraction = fraction
            self.g_orig, i = clear_random_portion(self.problem.genotype.data,
                                                  fraction)
        elif isinstance(test_index, (str, type(u''))) and test_index == 'hap':
            # Only compare against 'hap' when test_index is a string: comparing an index
            # array with a string is element-wise in recent NumPy and has no single truth value.
            # Don't clear anything; call everything a test index.
            h = problem.h
            i = tuple(
                util.flattened_meshgrid(range(h.shape[0]), range(h.shape[1])))
            self.g_orig = problem.g
            # Statistics are read from the caller's haplotypes, not from the working copy.
            self.h = h
            self.fraction = 1.0
        else:
            self.g_orig, i = clear_index(self.problem.g, test_index)
            self.fraction = (1.0 * i[0].size) / (self.h.shape[0] *
                                                 self.h.shape[1])
        self.num_tests = i[0].size
        self.test_index = i
        self.r_orig = recode.recode_single_genotype(self.g_orig)
        self.fill = self.problem.fill_fraction()[:, SAMPLE]
        # Lazily computed by the recoded_genotype property.
        self.__recode_single_genotype = None

    #---------------------------------------------
    # Methods
    #---------------------------------------------
    def __repr__(self):
        # self.fraction is a ratio; scale it so that the '%' sign in the output is truthful.
        return 'Experiment[%s, fraction=%.2f%%]' % (repr(
            self.problem), 100.0 * self.fraction)

    def run(self, phaser, params=None):
        '''Run phaser (or more generally, a handler) on the working problem and refresh the
        fill fractions. Statistics cached from a previous state are discarded.'''
        phaser.run(self.problem, params=params)
        self.fill = self.problem.fill_fraction()[:, SAMPLE]
        # The recoded test set may have been cached before this run; force it to be
        # recomputed so that the statistics reflect the phaser's output.
        self.__recode_single_genotype = None

    def num_test_genotypes(self, field):
        '''Return the number of test genotypes, broken by field (SNP=0, sample=1).'''
        return self.__group_by_field(np.arange(len(self.test_index[0])), field)

    def where_called(self):
        '''Return the indices of genotypes in which both alleles were called.'''
        # Positive entries of r = called entries
        return recode.where_called(self.recoded_genotype)[0]

    def called(self, field):
        '''Return the number of genotypes in which both alleles were called, broken by field (SNP=0, sample=1).'''
        return self.__group_by_field(self.where_called(), field)

    #---------------------------------------------
    # Properties
    #---------------------------------------------
    @property
    def test_orig(self):
        '''Return the recoded test entries of the haplotype array.'''
        # NOTE(review): this used to be documented as "the original set of deleted test
        # genotypes", but it recodes self.h at the test indices (an uncached equivalent of
        # recoded_genotype). The saved originals appear to be in self.r_orig - confirm which
        # one callers expect before changing the code.
        i = self.test_index
        return recode.recode_single_genotype(self.h[i[SNP], i[SAMPLE], :])

    @property
    def test_called(self):
        '''Return the called set of haplotypes corresponding to the test genotypes.'''
        i = self.test_index
        return self.h[i[SNP], i[SAMPLE], :]

    @property
    def recoded_genotype(self):
        '''Return the genotype test set, recoded as a single number per allele pair.
        The result is cached until the next call to run().'''
        if self.__recode_single_genotype is None:
            self.__recode_single_genotype = recode.recode_single_genotype(
                self.test_called)
        return self.__recode_single_genotype

    @property
    def total_called(self):
        '''Return the number of genotypes in which both alleles were called.'''
        return len(self.where_called())

    @property
    def total_partial_called(self):
        '''Return the number of genotypes in which one allele was called.'''
        return len(recode.where_partial_called(self.recoded_genotype)[0])

    @property
    def total_errors(self):
        '''Return the number of genotypes that were called incorrectly. (A genotype is an allele pair.)'''
        # Count entries that were called (positive) AND are different than the corresponding original value
        return len(recode.where_error(self.recoded_genotype, self.r_orig)[0])

    @property
    def total_partial_errors(self):
        '''Return the number of half-called genotypes that were called incorrectly.'''
        # This happens when hap=(0,1), genotype=(2,2) or hap(0,2), genotype=(1,1)
        return len(
            recode.where_partial_error(self.recoded_genotype, self.r_orig)[0])

    @property
    def full_call_fraction(self):
        '''Return the fraction of test genotypes that were fully called.'''
        return self.__fraction_of_tests(self.total_called)

    @property
    def partial_call_fraction(self):
        '''Return the fraction of test genotypes that were half-called.'''
        return self.__fraction_of_tests(self.total_partial_called)

    @property
    def full_error_fraction(self):
        '''Return the fraction of test genotypes that were erroneously fully-called.'''
        return self.__fraction_of_tests(self.total_errors)

    @property
    def partial_error_fraction(self):
        '''Return the fraction of test genotypes that were erroneously half-called.'''
        return self.__fraction_of_tests(self.total_partial_errors)

    @property
    def stats(self):
        '''Return a tuple containing all experiment statistics: test fraction, full and partial
        call fractions, full and partial error fractions.'''
        return (self.fraction, self.full_call_fraction,
                self.partial_call_fraction, self.full_error_fraction,
                self.partial_error_fraction)

    #---------------------------------------------
    # Private Methods
    #---------------------------------------------
    def __fraction_of_tests(self, count):
        '''Return count divided by the number of test genotypes, as a float.'''
        return (1.0 * count) / self.num_tests

    def __group_by_field(self, i, field):
        '''Group a test index subset i by field (SNP=0, sample=1). Returns an integer array with
        one entry per index along that field of the genotype data; entries that do not occur
        in the subset stay 0.'''
        size = self.problem.genotype.data.shape[field]
        group_count = util.dict_to_array(
            statutil.group_by_value(self.test_index[field][i]))
        result = np.zeros((size, ), dtype=int)
        # 'k' holds the grouped field values and 'v' the per-value results of group_by_value
        # (presumably occurrence counts - confirm against statutil).
        result[group_count['k']] = group_count['v']
        return result