def testUnitBootstrap(self):
    # Bootstrap standard errors are random, so the estimate is only compared
    # against a simulated reference value within a tolerance. Seeding the
    # generator keeps the test from being flaky; without the seed it would
    # fail with probability 0.05.
    # This is the testBootstrap problem with its rows split up: every value
    # in 1..100 appears on three rows, and Y (a copy of X) is passed as the
    # bootstrap unit.
    np.random.seed(12345)

    values = [value for value in range(1, 101) for _ in range(3)]
    data = pd.DataFrame({"X": values, "Y": list(values)})

    mean_of_x = metrics.Mean("X")
    unit_bootstrap = standard_errors.Bootstrap(100, unit="Y")
    analysis = core.Analyze(data).with_standard_errors(unit_bootstrap)
    output = analysis.calculate(mean_of_x).run()

    observed_se = output["mean(X) Bootstrap SE"].values[0]
    expected_se = 2.88
    tolerance = 0.41  # Two standard errors based on simulation.
    self.assertAlmostEqual(expected_se, observed_se, delta=tolerance)
def testBootstrap(self):
    # Bootstrap standard errors are random, so the estimate is only checked
    # for being statistically close to the theoretical value. Seeding the
    # generator keeps the test from being flaky; without the seed it would
    # fail with probability 0.05.
    np.random.seed(12345)

    data = pd.DataFrame({"X": range(1, 101)})
    mean_of_x = metrics.Mean("X")
    bootstrap = standard_errors.Bootstrap(100)
    analysis = core.Analyze(data).with_standard_errors(bootstrap)
    output = analysis.calculate(mean_of_x).run()
    observed_se = output["mean(X) Bootstrap SE"].values[0]

    # The reference value and tolerance come from this R simulation:
    #   set.seed(12345)
    #   library(bootstrap)
    #   x <- 1:100
    #   estimates <- replicate(1000, sd(bootstrap(x, 100, mean)$thetastar))
    #   mean(estimates)
    #   sd(estimates)
    expected_se = 2.88
    tolerance = 0.41  # Two standard errors based on simulation.
    self.assertAlmostEqual(expected_se, observed_se, delta=tolerance)
def testMeanWithWeights(self):
    # Calls the Mean metric directly with a per-row weight vector.
    # The expected 2.0 matches the weighted mean
    # (3*1 + 2*2 + 1*3 + 1*4) / (3 + 2 + 1 + 1) = 14 / 7.
    frame = pd.DataFrame({"X": [1, 2, 3, 4]})
    row_weights = np.array([3, 2, 1, 1])
    mean_of_x = metrics.Mean("X")

    observed = mean_of_x(frame, row_weights)

    expected = 2.0
    self.assertEqual(observed, expected)
def testMultipleCalculations(self):
    # Passes a list of two metrics to a single calculate() call and expects
    # a one-row frame with one column per metric.
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5]})
    analysis = core.Analyze(data)
    output = analysis.calculate([metrics.Sum("X"), metrics.Mean("X")]).run()

    expected_values = np.array([[15, 3.0]])
    expected = pd.DataFrame(expected_values, columns=["sum(X)", "mean(X)"])
    # The mixed int/float literal yields a float array, so the sum column is
    # cast back to int before comparing.
    sum_column = ["sum(X)"]
    expected[sum_column] = expected[sum_column].astype(int)

    self.assertTrue(output.equals(expected))