def test_balanced_subsets_basic(self):
    """Exercise balanced_subsets on a tiny frame with shuffling turned off.

    The fixture holds two rows labelled 1 and four rows labelled 0, so the
    assertions below expect two subsets that each pair both positive rows
    with two distinct negative rows.
    """
    rows = [
        [1, 11, 1],
        [2, 22, 1],
        [3, 33, 0],
        [4, 44, 0],
        [5, 55, 0],
        [6, 66, 0],
    ]
    df = pd.DataFrame(rows, columns=['a', 'b', 'y'])

    subsets = munge.balanced_subsets(df[['a', 'b']], df.y, shuffle=False)
    self.assertEqual(len(subsets), 2)
    (X1, y1), (X2, y2) = subsets[0], subsets[1]

    # Each subset must hold exactly two rows of each label.
    first_pos, first_neg = (y1 == 1).sum(), (y1 == 0).sum()
    self.assertTrue(first_pos == first_neg == 2)
    second_pos, second_neg = (y2 == 1).sum(), (y2 == 0).sum()
    self.assertTrue(second_pos == second_neg == 2)

    # With shuffle=False the rows are expected in their original order:
    # both subsets reuse a=1,2 and split the remaining rows between them.
    first_ids = set(X1.a.unique())
    second_ids = set(X2.a.unique())
    self.assertEqual(first_ids, {1, 2, 3, 4})
    self.assertEqual(second_ids, {1, 2, 5, 6})

    # Subsampling at 0.5: four subsets are expected, and the first and
    # third must each contain two rows.
    subsets = munge.balanced_subsets(
        df[['a', 'b']], df.y, subsample=0.5, shuffle=False)
    self.assertEqual(len(subsets), 4)
    X1, _ = subsets[0]
    X3, _ = subsets[2]
    rows_first, rows_third = X1.shape[0], X3.shape[0]
    self.assertTrue(rows_first == rows_third == 2)
def test_balanced_subsets_advanced(self):
    """Exercise balanced_subsets with explicit string labels.

    The fixture holds two 'true' rows and five 'false' rows, so the
    'false' rows cannot be split evenly; the assertions expect a first
    subset of 4 rows (2/2) and a second subset of 5 rows (2 'true',
    3 'false').
    """
    rows = [
        [1, 11, 'true'],
        [2, 22, 'true'],
        [3, 33, 'false'],
        [4, 44, 'false'],
        [5, 55, 'false'],
        [6, 66, 'false'],
        [7, 77, 'false'],
    ]
    df = pd.DataFrame(rows, columns=['a', 'b', 'y'])

    subsets = munge.balanced_subsets(
        df[['a', 'b']], df.y, labels=["true", "false"])
    self.assertEqual(len(subsets), 2)
    (X1, y1), (X2, y2) = subsets[0], subsets[1]

    # Row counts per subset.
    self.assertEqual(X1.shape[0], 4)
    self.assertEqual(X2.shape[0], 5)

    # The first subset is perfectly balanced.
    first_true = (y1 == 'true').sum()
    first_false = (y1 == 'false').sum()
    self.assertTrue(first_true == first_false == 2)

    # The second subset carries the extra 'false' row.
    self.assertEqual((y2 == 'true').sum(), 2)
    self.assertEqual((y2 == 'false').sum(), 3)