Esempio n. 1
0
    def test_balanced_subsets_basic(self):
        """Test balanced_subsets with basic params and no shuffling.

        The fixture holds two rows labelled 1 and four rows labelled 0.
        The default call is expected to yield two subsets, each with two
        rows per label.  With ``subsample=0.5`` four subsets are expected,
        and the first and third are checked to hold two rows each.
        """
        df = pd.DataFrame([
            [1, 11, 1],
            [2, 22, 1],
            [3, 33, 0],
            [4, 44, 0],
            [5, 55, 0],
            [6, 66, 0]], columns=['a', 'b', 'y'])
        ss = munge.balanced_subsets(df[['a', 'b']], df.y, shuffle=False)

        self.assertEqual(len(ss), 2)

        X1, y1 = ss[0]
        X2, y2 = ss[1]

        # Check equal number of labels per subset.  Each count gets its own
        # assertEqual so a failure reports the actual value instead of the
        # uninformative "False is not true" from assertTrue.
        self.assertEqual((y1 == 1).sum(), 2)
        self.assertEqual((y1 == 0).sum(), 2)
        self.assertEqual((y2 == 1).sum(), 2)
        self.assertEqual((y2 == 0).sum(), 2)

        # Check unshuffled results: both subsets share the rows labelled 1
        # (a = 1, 2) and split the rows labelled 0 in their original order.
        self.assertEqual(set(X1.a.unique()), {1, 2, 3, 4})
        self.assertEqual(set(X2.a.unique()), {1, 2, 5, 6})

        # Test with subsample
        ss = munge.balanced_subsets(
            df[['a', 'b']], df.y, subsample=0.5, shuffle=False)

        self.assertEqual(len(ss), 4)

        # Only the feature frames are inspected here; the label halves of
        # the pairs are unpacked (to keep the 2-tuple check) but discarded.
        X1, _ = ss[0]
        X3, _ = ss[2]
        self.assertEqual(X1.shape[0], 2)
        self.assertEqual(X3.shape[0], 2)
Esempio n. 2
0
    def test_balanced_subsets_advanced(self):
        """Test balanced_subsets with advanced params.

        Uses string labels passed explicitly through ``labels`` and a
        fixture with two 'true' rows and five 'false' rows.  Two subsets
        are expected: the first with two rows per label, the second with
        two 'true' rows and three 'false' rows.
        """
        df = pd.DataFrame([
            [1, 11, 'true'],
            [2, 22, 'true'],
            [3, 33, 'false'],
            [4, 44, 'false'],
            [5, 55, 'false'],
            [6, 66, 'false'],
            [7, 77, 'false']], columns=['a', 'b', 'y'])

        ss = munge.balanced_subsets(
            df[['a', 'b']], df.y, labels=["true", "false"])

        self.assertEqual(len(ss), 2)

        X1, y1 = ss[0]
        X2, y2 = ss[1]
        self.assertEqual(X1.shape[0], 4)
        # NOTE(review): presumably the leftover 'false' row is folded into
        # the last subset -- confirm against munge.balanced_subsets.
        self.assertEqual(X2.shape[0], 5)

        # Separate assertEqual calls (rather than assertTrue on a chained
        # comparison) so a failure shows which count was wrong.
        self.assertEqual((y1 == 'true').sum(), 2)
        self.assertEqual((y1 == 'false').sum(), 2)
        self.assertEqual((y2 == 'true').sum(), 2)
        self.assertEqual((y2 == 'false').sum(), 3)