class Experiment(object):
    '''A validation experiment: start with a Problem object, clear a certain portion of the data,
    run a phaser, and cross-check the haplotype results against the original genotype data.

    Attributes set by the constructor:
      problem    - working copy of the input problem (same pedigree, copied genotype).
      h          - haplotype array from which the test entries are read.
      g_orig     - original genotypes saved for the cross-check.
      r_orig     - g_orig passed through recode.recode_single_genotype().
      test_index - index arrays of the test entries; test_index[SNP] and test_index[SAMPLE]
                   are used to index h.
      num_tests  - number of test entries (test_index[0].size).
      fraction   - fraction (not percentage) of the data that forms the test set.
      fill       - a column of problem.fill_fraction(); refreshed by run().'''

    #---------------------------------------------
    # Constructors
    #---------------------------------------------
    def __init__(self, problem, fraction=None, test_index=None):
        '''Initialize an experiment to be run on a problem. Exactly one of 'fraction' and
        'test_index' must be specified.

        If 'fraction' is specified, a random portion of that size is cleared from the working
        copy of the genotype data. If 'test_index' is specified, these specific test indices
        are cleared instead.

        If test_index = 'hap', data is read from problem.h (haplotype array). The entire array
        is considered as a test array, but nothing is zeroed out. Useful for phasing result
        stats.

        Raises ValueError if both or neither of 'fraction' and 'test_index' are given.'''
        # Exactly one of the two selectors may be supplied.
        if (fraction is None) == (test_index is None):
            raise ValueError('Must specify fraction or test_index')

        # Create a working copy of the problem. Only the data is copied.
        self.problem = Problem(problem.pedigree, problem.genotype.copy())
        self.h = self.problem.h

        # Create test set; save original genotypes in g_orig
        if test_index is None:
            self.fraction = fraction
            self.g_orig, i = clear_random_portion(self.problem.genotype.data, fraction)
        elif test_index == 'hap':
            # Don't clear anything; call everything a test index. Note that here the
            # haplotypes come from the ORIGINAL problem, not from the working copy.
            h = problem.h
            i = tuple(util.flattened_meshgrid(range(h.shape[0]), range(h.shape[1])))
            self.g_orig = problem.g
            self.h = h
            self.fraction = 1.0
        else:
            self.g_orig, i = clear_index(self.problem.g, test_index)
            self.fraction = (1.0 * i[0].size) / (self.h.shape[0] * self.h.shape[1])

        self.num_tests = i[0].size
        self.test_index = i
        self.r_orig = recode.recode_single_genotype(self.g_orig)
        self.fill = self.problem.fill_fraction()[:, SAMPLE]
        # Lazily-computed cache behind the recoded_genotype property.
        self.__recode_single_genotype = None

    #---------------------------------------------
    # Methods
    #---------------------------------------------
    def __repr__(self):
        # self.fraction is stored as a fraction, so scale it to match the '%' sign in the
        # format string.
        return 'Experiment[%s, fraction=%.2f%%]' % (repr(self.problem), 100.0 * self.fraction)

    def run(self, phaser, params=None):
        '''Run phaser (or more generally, a handler) on the working copy of the problem and
        refresh the data derived from it.'''
        phaser.run(self.problem, params=params)
        # The handler may have changed the haplotypes, so a recoding cached before this call
        # can no longer be trusted; drop it so that the statistics are recomputed on demand.
        self.__recode_single_genotype = None
        # NOTE(review): column 1 is presumably the SAMPLE column used in __init__ - confirm.
        self.fill = self.problem.fill_fraction()[:, 1]

    def num_test_genotypes(self, field):
        '''Return the number of test genotypes, broken by field (SNP=0, sample=1).'''
        return self.__group_by_field(np.arange(len(self.test_index[0])), field)

    def where_called(self):
        '''Return the indices (into the test set) of genotypes in which both alleles were
        called.'''
        # Positive entries of r = called entries
        return recode.where_called(self.recoded_genotype)[0]

    def called(self, field):
        '''Return the number of test genotypes in which both alleles were called, broken by
        field (SNP=0, sample=1).'''
        return self.__group_by_field(self.where_called(), field)

    #---------------------------------------------
    # Properties
    #---------------------------------------------
    @property
    def test_orig(self):
        '''Return the haplotypes at the test indices, recoded by
        recode.recode_single_genotype().

        NOTE(review): despite its name, this reads the current haplotype array self.h rather
        than the saved originals (g_orig / r_orig), i.e. it computes the same value as
        recoded_genotype without caching - confirm which one is intended.'''
        i = self.test_index
        return recode.recode_single_genotype(self.h[i[SNP], i[SAMPLE], :])

    @property
    def test_called(self):
        '''Return the called set of haplotypes corresponding to the test genotypes.'''
        i = self.test_index
        return self.h[i[SNP], i[SAMPLE], :]

    @property
    def recoded_genotype(self):
        '''Return the genotype test set, recoded as a single number per allele pair. The
        result is cached until the next call to run().'''
        if self.__recode_single_genotype is None:
            self.__recode_single_genotype = recode.recode_single_genotype(self.test_called)
        return self.__recode_single_genotype

    @property
    def total_called(self):
        '''Return the number of test genotypes in which both alleles were called.'''
        return len(self.where_called())

    @property
    def total_partial_called(self):
        '''Return the number of test genotypes in which only one allele was called.'''
        return len(recode.where_partial_called(self.recoded_genotype)[0])

    @property
    def total_errors(self):
        '''Return the number of test genotypes that were called incorrectly. (A genotype is an
        allele pair.)'''
        # Count entries that were called (positive) AND are different than the corresponding
        # original value
        return len(recode.where_error(self.recoded_genotype, self.r_orig)[0])

    @property
    def total_partial_errors(self):
        '''Return the number of test genotypes that were partially called and disagree with
        the original genotype. (A genotype is an allele pair.)'''
        # This happens when hap=(0,1), genotype=(2,2) or hap=(0,2), genotype=(1,1)
        return len(recode.where_partial_error(self.recoded_genotype, self.r_orig)[0])

    @property
    def full_call_fraction(self):
        '''Return the fraction of test genotypes that were fully called.'''
        return self.__fraction_of_tests(self.total_called)

    @property
    def partial_call_fraction(self):
        '''Return the fraction of test genotypes that were half-called.'''
        return self.__fraction_of_tests(self.total_partial_called)

    @property
    def full_error_fraction(self):
        '''Return the fraction of test genotypes that were erroneously fully-called.'''
        return self.__fraction_of_tests(self.total_errors)

    @property
    def partial_error_fraction(self):
        '''Return the fraction of test genotypes that were erroneously half-called.'''
        return self.__fraction_of_tests(self.total_partial_errors)

    @property
    def stats(self):
        '''Return a tuple containing all experiment statistics: (fraction, full call fraction,
        partial call fraction, full error fraction, partial error fraction).'''
        return (self.fraction,
                self.full_call_fraction,
                self.partial_call_fraction,
                self.full_error_fraction,
                self.partial_error_fraction)

    #---------------------------------------------
    # Private Methods
    #---------------------------------------------
    def __fraction_of_tests(self, count):
        '''Return count as a floating-point fraction of the number of test genotypes.'''
        return (1.0 * count) / self.num_tests

    def __group_by_field(self, i, field):
        '''Group a test index subset i by field (SNP=0, sample=1).

        Returns an integer array whose length is the size of the genotype data along 'field'.
        Entries that receive no value from the grouping stay 0.'''
        size = self.problem.genotype.data.shape[field]
        # Presumably the 'k' column holds the distinct field values and 'v' their counts -
        # confirm against util.dict_to_array / statutil.group_by_value.
        group_count = util.dict_to_array(statutil.group_by_value(self.test_index[field][i]))
        result = np.zeros((size,), dtype=int)
        result[group_count['k']] = group_count['v']
        return result
class Experiment(object):
    '''A validation experiment: start with a Problem object, clear a certain portion of the data,
    run a phaser, and cross-check the haplotype results against the original genotype data.

    Attributes set by the constructor:
      problem    - working copy of the input problem (same pedigree, copied genotype).
      h          - haplotype array from which the test entries are read.
      g_orig     - original genotypes saved for the cross-check.
      r_orig     - g_orig passed through recode.recode_single_genotype().
      test_index - index arrays of the test entries; test_index[SNP] and test_index[SAMPLE]
                   are used to index h.
      num_tests  - number of test entries (test_index[0].size).
      fraction   - fraction (not percentage) of the data that forms the test set.
      fill       - a column of problem.fill_fraction(); refreshed by run().'''

    #---------------------------------------------
    # Constructors
    #---------------------------------------------
    def __init__(self, problem, fraction=None, test_index=None):
        '''Initialize an experiment to be run on a problem. Exactly one of 'fraction' and
        'test_index' must be specified.

        If 'fraction' is specified, a random portion of that size is cleared from the working
        copy of the genotype data. If 'test_index' is specified, these specific test indices
        are cleared instead.

        If test_index = 'hap', data is read from problem.h (haplotype array). The entire array
        is considered as a test array, but nothing is zeroed out. Useful for phasing result
        stats.

        Raises ValueError if both or neither of 'fraction' and 'test_index' are given.'''
        # Exactly one of the two selectors may be supplied.
        if (fraction is None) == (test_index is None):
            raise ValueError('Must specify fraction or test_index')

        # Create a working copy of the problem. Only the data is copied.
        self.problem = Problem(problem.pedigree, problem.genotype.copy())
        self.h = self.problem.h

        # Create test set; save original genotypes in g_orig
        if test_index is None:
            self.fraction = fraction
            self.g_orig, i = clear_random_portion(self.problem.genotype.data, fraction)
        elif test_index == 'hap':
            # Don't clear anything; call everything a test index. Note that here the
            # haplotypes come from the ORIGINAL problem, not from the working copy.
            h = problem.h
            i = tuple(util.flattened_meshgrid(range(h.shape[0]), range(h.shape[1])))
            self.g_orig = problem.g
            self.h = h
            self.fraction = 1.0
        else:
            self.g_orig, i = clear_index(self.problem.g, test_index)
            self.fraction = (1.0 * i[0].size) / (self.h.shape[0] * self.h.shape[1])

        self.num_tests = i[0].size
        self.test_index = i
        self.r_orig = recode.recode_single_genotype(self.g_orig)
        self.fill = self.problem.fill_fraction()[:, SAMPLE]
        # Lazily-computed cache behind the recoded_genotype property.
        self.__recode_single_genotype = None

    #---------------------------------------------
    # Methods
    #---------------------------------------------
    def __repr__(self):
        # self.fraction is stored as a fraction, so scale it to match the '%' sign in the
        # format string.
        return 'Experiment[%s, fraction=%.2f%%]' % (repr(self.problem), 100.0 * self.fraction)

    def run(self, phaser, params=None):
        '''Run phaser (or more generally, a handler) on the working copy of the problem and
        refresh the data derived from it.'''
        phaser.run(self.problem, params=params)
        # The handler may have changed the haplotypes, so a recoding cached before this call
        # can no longer be trusted; drop it so that the statistics are recomputed on demand.
        self.__recode_single_genotype = None
        # NOTE(review): column 1 is presumably the SAMPLE column used in __init__ - confirm.
        self.fill = self.problem.fill_fraction()[:, 1]

    def num_test_genotypes(self, field):
        '''Return the number of test genotypes, broken by field (SNP=0, sample=1).'''
        return self.__group_by_field(np.arange(len(self.test_index[0])), field)

    def where_called(self):
        '''Return the indices (into the test set) of genotypes in which both alleles were
        called.'''
        # Positive entries of r = called entries
        return recode.where_called(self.recoded_genotype)[0]

    def called(self, field):
        '''Return the number of test genotypes in which both alleles were called, broken by
        field (SNP=0, sample=1).'''
        return self.__group_by_field(self.where_called(), field)

    #---------------------------------------------
    # Properties
    #---------------------------------------------
    @property
    def test_orig(self):
        '''Return the haplotypes at the test indices, recoded by
        recode.recode_single_genotype().

        NOTE(review): despite its name, this reads the current haplotype array self.h rather
        than the saved originals (g_orig / r_orig), i.e. it computes the same value as
        recoded_genotype without caching - confirm which one is intended.'''
        i = self.test_index
        return recode.recode_single_genotype(self.h[i[SNP], i[SAMPLE], :])

    @property
    def test_called(self):
        '''Return the called set of haplotypes corresponding to the test genotypes.'''
        i = self.test_index
        return self.h[i[SNP], i[SAMPLE], :]

    @property
    def recoded_genotype(self):
        '''Return the genotype test set, recoded as a single number per allele pair. The
        result is cached until the next call to run().'''
        if self.__recode_single_genotype is None:
            self.__recode_single_genotype = recode.recode_single_genotype(self.test_called)
        return self.__recode_single_genotype

    @property
    def total_called(self):
        '''Return the number of test genotypes in which both alleles were called.'''
        return len(self.where_called())

    @property
    def total_partial_called(self):
        '''Return the number of test genotypes in which only one allele was called.'''
        return len(recode.where_partial_called(self.recoded_genotype)[0])

    @property
    def total_errors(self):
        '''Return the number of test genotypes that were called incorrectly. (A genotype is an
        allele pair.)'''
        # Count entries that were called (positive) AND are different than the corresponding
        # original value
        return len(recode.where_error(self.recoded_genotype, self.r_orig)[0])

    @property
    def total_partial_errors(self):
        '''Return the number of test genotypes that were partially called and disagree with
        the original genotype. (A genotype is an allele pair.)'''
        # This happens when hap=(0,1), genotype=(2,2) or hap=(0,2), genotype=(1,1)
        return len(recode.where_partial_error(self.recoded_genotype, self.r_orig)[0])

    @property
    def full_call_fraction(self):
        '''Return the fraction of test genotypes that were fully called.'''
        return self.__fraction_of_tests(self.total_called)

    @property
    def partial_call_fraction(self):
        '''Return the fraction of test genotypes that were half-called.'''
        return self.__fraction_of_tests(self.total_partial_called)

    @property
    def full_error_fraction(self):
        '''Return the fraction of test genotypes that were erroneously fully-called.'''
        return self.__fraction_of_tests(self.total_errors)

    @property
    def partial_error_fraction(self):
        '''Return the fraction of test genotypes that were erroneously half-called.'''
        return self.__fraction_of_tests(self.total_partial_errors)

    @property
    def stats(self):
        '''Return a tuple containing all experiment statistics: (fraction, full call fraction,
        partial call fraction, full error fraction, partial error fraction).'''
        return (self.fraction,
                self.full_call_fraction,
                self.partial_call_fraction,
                self.full_error_fraction,
                self.partial_error_fraction)

    #---------------------------------------------
    # Private Methods
    #---------------------------------------------
    def __fraction_of_tests(self, count):
        '''Return count as a floating-point fraction of the number of test genotypes.'''
        return (1.0 * count) / self.num_tests

    def __group_by_field(self, i, field):
        '''Group a test index subset i by field (SNP=0, sample=1).

        Returns an integer array whose length is the size of the genotype data along 'field'.
        Entries that receive no value from the grouping stay 0.'''
        size = self.problem.genotype.data.shape[field]
        # Presumably the 'k' column holds the distinct field values and 'v' their counts -
        # confirm against util.dict_to_array / statutil.group_by_value.
        group_count = util.dict_to_array(statutil.group_by_value(self.test_index[field][i]))
        result = np.zeros((size,), dtype=int)
        result[group_count['k']] = group_count['v']
        return result