    def test_all_models_have_pval(self):
        # Every model should receive a finite p-value after compute()
        losses = self.losses_df.iloc[:, :20]
        mcs = MCS(losses, 0.05, reps=200)
        mcs.seed(23456)
        mcs.compute()
        nan_locs = np.isnan(mcs.pvalues.iloc[:, 0])
        assert not nan_locs.any()
    def test_exact_ties(self):
        # Force half of the models to have exactly equal mean losses and
        # verify that compute() handles the ties without error
        losses = self.losses_df.iloc[:, :20].copy()
        tied_mean = losses.mean().median()
        losses.iloc[:, 10:] -= losses.iloc[:, 10:].mean()
        losses.iloc[:, 10:] += tied_mean
        mcs = MCS(losses, 0.05, reps=200)
        mcs.seed(23456)
        mcs.compute()
    def test_str_repr(self):
        mcs = MCS(self.losses, 0.05)
        expected = 'MCS(size: 0.05, bootstrap: ' + str(mcs.bootstrap) + ')'
        assert_equal(str(mcs), expected)
        expected = expected[:-1] + ', ID: ' + hex(id(mcs)) + ')'
        assert_equal(mcs.__repr__(), expected)
        expected = ('<strong>MCS</strong>(' +
                    '<strong>size</strong>: 0.05, ' +
                    '<strong>bootstrap</strong>: ' + str(mcs.bootstrap) +
                    ', ' + '<strong>ID</strong>: ' + hex(id(mcs)) + ')')
        assert_equal(mcs._repr_html_(), expected)
    def test_r_method(self):
        def r_step(losses, indices):
            # A basic but direct implementation of the r method
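            # Single elimination step: studentize every pairwise mean-loss
            # differential with its bootstrap variance, compare the largest
            # observed statistic against the bootstrap distribution of the
            # maximum, and report the model attaining the maximum for removal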
            t, k = losses.shape
            b = len(indices)
            mean_diffs = losses.mean(0)
            loss_diffs = np.zeros((k, k))
            variances = np.zeros((k, k))
            bs_diffs = np.zeros(b)
            stat_candidates = []
            for i in range(k):
                for j in range(i, k):
                    if i == j:
                        variances[i, i] = 1.0
                        loss_diffs[i, j] = 0.0
                        continue
                    loss_diffs_vec = losses[:, i] - losses[:, j]
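                    # Demean the differential so its bootstrap means are
                    # centered at zero, mimicking the null of equal loss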
                    loss_diffs_vec = loss_diffs_vec - loss_diffs_vec.mean()
                    loss_diffs[i, j] = mean_diffs[i] - mean_diffs[j]
                    loss_diffs[j, i] = mean_diffs[j] - mean_diffs[i]
                    for n in range(b):
                        # Compute bootstrapped versions of the mean differential
                        bs_diffs[n] = loss_diffs_vec[indices[n]].mean()
                    variances[j, i] = variances[i, j] = (bs_diffs ** 2).mean()
                    stat_candidates.append(
                        np.abs(bs_diffs) / np.sqrt(variances[i, j]))
            stat_candidates = np.array(stat_candidates).T
            stat_distn = np.max(stat_candidates, 1)
            std_loss_diffs = loss_diffs / np.sqrt(variances)
            stat = np.max(std_loss_diffs)
            pval = np.mean(stat <= stat_distn)
            loc = np.argwhere(std_loss_diffs == stat)
            drop_index = loc.flat[0]
            return pval, drop_index

        losses = self.losses[:, :10]  # Limit size
        mcs = MCS(losses, 0.05, reps=200)
        mcs.seed(23456)
        mcs.compute()
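        # Directly replicate the first m elimination steps of the R method
        # using the same bootstrap indices; reported p-values are the running
        # maximum of the per-step p-values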
        m = 5  # Number of elimination steps to verify directly
        pvals = np.zeros(m) * np.nan
        indices = np.zeros(m) * np.nan
        for i in range(m):
            removed = list(indices[np.isfinite(indices)])
            include = list(set(list(range(10))).difference(removed))
            include.sort()
            pval, drop_index = r_step(
                losses[:, np.array(include)], mcs._bootsrap_indices)
            pvals[i] = pval if i == 0 else np.max([pvals[i - 1], pval])
            indices[i] = include[drop_index]
        direct = pd.DataFrame(pvals,
                              index=np.array(indices, dtype=np.int64),
                              columns=['Pvalue'])
        direct.index.name = 'Model index'
        assert_frame_equal(mcs.pvalues.iloc[:m], direct)
    def test_smoke(self):
        # Exercise both methods and both input types end-to-end
        mcs = MCS(self.losses, 0.05, reps=100, block_size=10, method='max')
        mcs.compute()
        mcs = MCS(self.losses_df, 0.05, reps=100, block_size=10, method='r')
        mcs.compute()
        assert_equal(type(mcs.included), list)
        assert_equal(type(mcs.excluded), list)
        assert isinstance(mcs.pvalues, pd.DataFrame)
    def test_errors(self):
        # A 1-d loss input must raise
        with pytest.raises(ValueError):
            MCS(self.losses[:, 1], 0.05)
        mcs = MCS(self.losses,
                  0.05,
                  reps=100,
                  block_size=10,
                  method='max',
                  bootstrap='circular')
        mcs.compute()
        mcs = MCS(self.losses,
                  0.05,
                  reps=100,
                  block_size=10,
                  method='max',
                  bootstrap='moving block')
        mcs.compute()
        # An unrecognized bootstrap name must raise
        with pytest.raises(ValueError):
            MCS(self.losses, 0.05, bootstrap='unknown')
    def test_max_method(self):
        def max_step(losses, indices):
            # A basic but direct implementation of the max method
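            # Single elimination step: studentize each model's deviation from
            # the cross-model average loss with its bootstrap variance,
            # compare the largest deviation against the bootstrap distribution
            # of the maximum, and report the model attaining the maximum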
            t, k = losses.shape
            b = len(indices)
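            # Demean the losses so the bootstrap column means are centered
            # at zero, mimicking the null of equal expected loss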
            loss_errors = losses - losses.mean(0)
            stats = np.zeros((b, k))
            for n in range(b):
                # Compute bootstrapped versions of the centered mean losses
                bs_loss_errors = loss_errors[indices[n]]
                stats[n] = bs_loss_errors.mean(0) - bs_loss_errors.mean()
            variances = (stats ** 2).mean(0)
            std_devs = np.sqrt(variances)
            stat_dist = np.max(stats / std_devs, 1)

            test_stat = (losses.mean(0) - losses.mean())
            std_test_stat = test_stat / std_devs
            test_stat = np.max(std_test_stat)
            pval = (test_stat < stat_dist).mean()
            drop_index = np.argwhere(std_test_stat == test_stat).squeeze()
            return pval, drop_index, std_devs

        losses = self.losses[:, :10]  # Limit size
        mcs = MCS(losses, 0.05, reps=200, method='max')
        mcs.seed(23456)
        mcs.compute()
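        # Directly replicate the first m elimination steps of the max method
        # with the same bootstrap indices and compare against MCS p-values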
        m = 8  # Number of elimination steps to verify directly
        pvals = np.zeros(m) * np.nan
        indices = np.zeros(m) * np.nan
        for i in range(m):
            removed = list(indices[np.isfinite(indices)])
            include = list(set(list(range(10))).difference(removed))
            include.sort()
            pval, drop_index, std_devs = max_step(
                losses[:, np.array(include)], mcs._bootsrap_indices)
            pvals[i] = pval if i == 0 else np.max([pvals[i - 1], pval])
            indices[i] = include[drop_index]
        direct = pd.DataFrame(pvals,
                              index=np.array(indices, dtype=np.int64),
                              columns=['Pvalue'])
        direct.index.name = 'Model index'
        assert_frame_equal(mcs.pvalues.iloc[:m], direct)