def test_multinomial_elementwise_distribution(self):
    '''Verify that the created variables approach a multinomial distribution for large
    numbers of samples.

    For every sample size r = 2^4 .. 2^16, variables are drawn with
    statutil.multinomial_elementwise(p, r) and the scaled Frobenius norm of
    (observed frequencies - p) is recorded. A linear regression of log(error) on log(r)
    must then have a slope close to -0.5, a bounded coefficient and |r_value| > 0.99.'''
    # p has shape (m, n); k experiments are averaged for each sample size.
    (m, n, k) = (6, 5, 1)
    r = 2 ** np.arange(4, 17)
    p = statutil.random_row_stochastic((m, n))
    error = np.zeros((len(r),))
    for (i, r_val) in enumerate(r):
        # range() iterates identically to xrange() here and exists on Python 2 and 3.
        for _ in range(k):
            x = statutil.multinomial_elementwise(p, r_val)
            # Root-mean-square-error of observed frequencies w.r.t. desired frequencies
            error[i] += statutil.norm_frobenius_scaled(statutil.hist(x, n) / (1.0 * r_val) - p)
        # Average the accumulated error over the k experiments
        error[i] /= (1.0 * k)

    # Validate the error model implied by the Central Limit Theorem: C*r^(-0.5).
    # We are making k experiments for each sample size r. Even if k=1, there's a 95% chance
    # that we are within ~1.96 standard deviations from the mean of the normal distribution
    # sqrt(r)*[observed freq variable - p[i,j]] for each entry j of a row i of the matrix p.
    # So if row i's stddev is s[i], the sum of squared errors should be (with 95% confidence)
    # <= n * (1.96*s[i])^2. So
    # C <= sqrt(sum_i(n * (1.96*s[i])^2) / (m*n)) = 1.96 * sqrt(sum_i(s[i]^2) / m).
    # See http://en.wikipedia.org/wiki/Central_limit_theorem
    alpha, c, r_value, _, _ = linregress(np.log(r), np.log(error))
    c = np.exp(c)
    assert_almost_equal(alpha, -0.5, decimal=1, err_msg='Unexpected error term growth power')

    # Per-row E[j^2] - E[j]^2 of the column index j weighted by p.
    # NOTE(review): the bound takes the 2-norm of these variances, whereas the derivation
    # above is written in terms of the standard deviations s[i] -- confirm this is intended.
    column_index = np.arange(p.shape[1])
    row_variance = (np.sum(p * column_index ** 2, axis=1)
                    - np.sum(p * column_index, axis=1) ** 2)
    # np.linalg.norm is the public entry point; np.linalg.linalg is a private submodule.
    c_bound = 1.96 * np.linalg.norm(row_variance, 2) / np.sqrt(p.shape[0])
    self.assertTrue(c <= c_bound, 'Error term coefficient outside 95% confidence interval')
    self.assertTrue(abs(r_value) > 0.99, 'Error does not fit a power law in sample size')
def __handle_estimate_genotype_frequencies(self, request):
    """Estimate genotype frequencies from the genotype data and save them in ProblemInfo.

    Fills the SNP metadata "count" table in place (one column per value of
    recode.GENOTYPE_CODE) and stores the row-scaled table under "frequency".

    request -- object whose ``problem`` attribute provides ``info.snp``, ``genotype.data``
               and ``num_snps``.

    Always returns False.
    NOTE(review): presumably False tells the caller to keep processing the request --
    confirm against the handler framework."""
    # Load problem fields
    problem = request.problem
    snp_metadata = problem.info.snp
    snp_count = snp_metadata["count"]

    # Recode genotypes to a single number
    r = recode.recode_single_genotype(problem.genotype.data)

    # Count genotype appearances for each SNP, and save in SNP annotation array.
    # The frequency table column order matches the GENOTYPE_CODE array. This includes filled
    # and missing genotypes: (1,1),(1,2),(2,2),(0,0).
    # NOTE(review): this relies on GENOTYPE_CODE iterating in that order -- confirm it is an
    # ordered mapping.
    # .values() yields the same values in the same order as .itervalues() and also exists
    # on Python 3.
    for col, genotype_code in enumerate(recode.GENOTYPE_CODE.values()):
        # First-axis indices of all entries equal to this genotype code
        # (presumably SNP indices -- verify against recode_single_genotype's layout).
        matching_index = np.where(r == genotype_code)[0]
        snp_count[:, col] = statutil.hist(matching_index, problem.num_snps)

    # Calculate frequencies
    snp_metadata["frequency"] = statutil.scale_row_sums(snp_count.astype("float"))
    return False
def test_multinomial_elementwise_distribution(self):
    '''Verify that the created variables approach a multinomial distribution for large
    numbers of samples.

    For every sample size r = 2^4 .. 2^16, variables are drawn with
    statutil.multinomial_elementwise(p, r) and the scaled Frobenius norm of
    (observed frequencies - p) is recorded. A linear regression of log(error) on log(r)
    must then have a slope close to -0.5, a bounded coefficient and |r_value| > 0.99.'''
    # p has shape (m, n); k experiments are averaged for each sample size.
    (m, n, k) = (6, 5, 1)
    r = 2**np.arange(4, 17)
    p = statutil.random_row_stochastic((m, n))
    error = np.zeros((len(r), ))
    for (i, r_val) in enumerate(r):
        # range() iterates identically to xrange() here and exists on Python 2 and 3.
        for _ in range(k):
            x = statutil.multinomial_elementwise(p, r_val)
            # Root-mean-square-error of observed frequencies w.r.t. desired frequencies
            error[i] += statutil.norm_frobenius_scaled(
                statutil.hist(x, n) / (1.0 * r_val) - p)
        # Average the accumulated error over the k experiments
        error[i] /= (1.0 * k)

    # Validate the error model implied by the Central Limit Theorem: C*r^(-0.5).
    # We are making k experiments for each sample size r. Even if k=1, there's a 95% chance
    # that we are within ~1.96 standard deviations from the mean of the normal distribution
    # sqrt(r)*[observed freq variable - p[i,j]] for each entry j of a row i of the matrix p.
    # So if row i's stddev is s[i], the sum of squared errors should be (with 95% confidence)
    # <= n * (1.96*s[i])^2. So
    # C <= sqrt(sum_i(n * (1.96*s[i])^2) / (m*n)) = 1.96 * sqrt(sum_i(s[i]^2) / m).
    # See http://en.wikipedia.org/wiki/Central_limit_theorem
    alpha, c, r_value, _, _ = linregress(np.log(r), np.log(error))
    c = np.exp(c)
    assert_almost_equal(
        alpha, -0.5, decimal=1, err_msg='Unexpected error term growth power')

    # Per-row E[j^2] - E[j]^2 of the column index j weighted by p.
    # NOTE(review): the bound takes the 2-norm of these variances, whereas the derivation
    # above is written in terms of the standard deviations s[i] -- confirm this is intended.
    column_index = np.arange(p.shape[1])
    row_variance = (np.sum(p * column_index**2, axis=1) -
                    np.sum(p * column_index, axis=1)**2)
    # np.linalg.norm is the public entry point; np.linalg.linalg is a private submodule.
    c_bound = 1.96 * np.linalg.norm(row_variance, 2) / np.sqrt(p.shape[0])
    self.assertTrue(
        c <= c_bound,
        'Error term coefficient outside 95% confidence interval')
    self.assertTrue(
        abs(r_value) > 0.99, 'Error does not fit a power law in sample size')
def __handle_estimate_genotype_frequencies(self, request):
    '''Estimate genotype frequencies from the genotype data and save them in ProblemInfo.

    Fills the SNP metadata 'count' table in place (one column per value of
    recode.GENOTYPE_CODE) and stores the row-scaled table under 'frequency'.

    request -- object whose ``problem`` attribute provides ``info.snp``, ``genotype.data``
               and ``num_snps``.

    Always returns False.
    NOTE(review): presumably False tells the caller to keep processing the request --
    confirm against the handler framework.'''
    # Load problem fields
    problem = request.problem
    snp_metadata = problem.info.snp
    snp_count = snp_metadata['count']

    # Recode genotypes to a single number
    r = recode.recode_single_genotype(problem.genotype.data)

    # Count genotype appearances for each SNP, and save in SNP annotation array.
    # The frequency table column order matches the GENOTYPE_CODE array. This includes filled
    # and missing genotypes: (1,1),(1,2),(2,2),(0,0).
    # NOTE(review): this relies on GENOTYPE_CODE iterating in that order -- confirm it is an
    # ordered mapping.
    # .values() yields the same values in the same order as .itervalues() and also exists
    # on Python 3.
    for col, genotype_code in enumerate(recode.GENOTYPE_CODE.values()):
        # First-axis indices of all entries equal to this genotype code
        # (presumably SNP indices -- verify against recode_single_genotype's layout).
        matching_index = np.where(r == genotype_code)[0]
        snp_count[:, col] = statutil.hist(matching_index, problem.num_snps)

    # Calculate frequencies
    snp_metadata['frequency'] = statutil.scale_row_sums(
        snp_count.astype('float'))
    return False