def test_split_classification_many_imbalanced_classes(self):
    """Validation labels must only contain the two frequent classes.

    The label vector holds classes 0 and 1 with eight samples each and
    classes 2-5 with a single sample each. For ten independently
    shuffled label orderings, the validation labels returned by
    ``split_data(X, y, classification=True)`` must not exceed 1.
    """
    # Repeat with freshly shuffled labels so the check does not hinge on
    # one particular ordering of the samples.
    for _ in range(10):
        X = np.array([range(20), range(20)]).transpose()
        y = np.array((0, 0, 0, 0, 0, 0, 0, 0,
                      1, 1, 1, 1, 1, 1, 1, 1,
                      2, 3, 4, 5))
        np.random.shuffle(y)

        # Only the validation labels are inspected by this test.
        _, _, _, Y_valid = split_data(X, y, classification=True)

        self.assertLessEqual(max(Y_valid), 1)
def _split_regular(self):
    """Run ``split_data`` with default arguments on six labelled samples.

    Verifies the shape of each of the four returned arrays (four training
    rows, two validation rows) and the exact label order in both parts.

    NOTE(review): the name lacks the ``test`` prefix, so default unittest
    discovery presumably does not run this method -- confirm whether that
    is intentional.
    """
    features = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
    labels = np.array([0, 0, 0, 1, 1, 2])

    train_x, valid_x, train_y, valid_y = split_data(features, labels)

    # Check shapes
    expected_shapes = (
        (train_x, (4, 2)),
        (train_y, (4, )),
        (valid_x, (2, 2)),
        (valid_y, (2, )),
    )
    for part, shape in expected_shapes:
        self.assertEqual(part.shape, shape)

    # Check the label values, including their order
    self.assertListEqual(list(valid_y), [0, 0])
    self.assertListEqual(list(train_y), [2, 0, 1, 1])
def _stratify(self):
    """Run ``split_data`` on six samples with a 4:2 class imbalance.

    Verifies the per-axis sizes of the returned arrays (four training
    rows, two validation rows, two feature columns) and the exact label
    order in both parts.

    NOTE(review): ``split_data`` is called without ``classification=True``
    here even though the method name suggests stratification, and the
    name lacks the ``test`` prefix so default unittest discovery
    presumably skips it -- confirm both are intentional.
    """
    features = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
    labels = np.array([0, 0, 0, 0, 1, 1])

    train_x, valid_x, train_y, valid_y = split_data(features, labels)

    # Check shapes, one axis at a time: (array, axis, expected size)
    expected_sizes = (
        (train_x, 0, 4),
        (train_x, 1, 2),
        (train_y, 0, 4),
        (valid_x, 0, 2),
        (valid_x, 1, 2),
        (valid_y, 0, 2),
    )
    for part, axis, size in expected_sizes:
        self.assertEqual(part.shape[axis], size)

    # Check the label values, including their order
    self.assertListEqual(list(valid_y), [1, 0])
    self.assertListEqual(list(train_y), [0, 0, 0, 1])
def test_split_data_regression(self):
    """Check ``split_data`` with default arguments on random float targets.

    With NumPy's global RNG seeded to 42, 1000 samples must be divided
    into 670 training and 330 validation rows, the feature dimension must
    be kept, and two spot-checked feature values must match the recorded
    reference numbers.
    """
    sample_count = 1000

    # The order of the RNG calls below determines the reference values
    # asserted at the end; do not reorder them.
    np.random.seed(42)
    feature_count = np.random.randint(1, 100)
    features = np.random.rand(sample_count, feature_count)
    targets = np.random.rand(sample_count)

    train_x, valid_x, train_y, valid_y = split_data(features, targets)

    # Row counts of every returned array
    expected_rows = (
        (train_x, 670),
        (valid_x, 330),
        (train_y, 670),
        (valid_y, 330),
    )
    for part, rows in expected_rows:
        self.assertEqual(part.shape[0], rows)

    # Feature dimension is unchanged by the split
    for part in (train_x, valid_x):
        self.assertEqual(part.shape[1], feature_count)

    # Random checks
    self.assertAlmostEqual(train_x[4, 2], 0.5986584841970366)
    self.assertAlmostEqual(valid_x[4, 2], 0.63911512838980322)
def _save_ensemble_data(x_data, y_data, tmp_dir, watcher):
    """Split the dataset and store the targets needed by the ensemble script.

    The data is passed through ``split_data`` and the fourth returned
    array is written to ``true_labels_ensemble.npy`` inside ``tmp_dir``,
    unless that file already exists. The existence check and the write
    happen while holding a lock file next to the target
    (``<target>.lock``).

    :param x_data: features handed to ``split_data``
    :param y_data: targets handed to ``split_data``
    :param tmp_dir: directory that receives ``true_labels_ensemble.npy``
    :param watcher: object providing ``start_task``/``stop_task``; the
        whole operation is recorded under the task name ``'LoadData'``
    :return: None
    """
    timer_label = 'LoadData'
    watcher.start_task(timer_label)

    # Only the last of the four arrays returned by split_data is stored.
    _, _, _, ensemble_targets = split_data(x_data, y_data)

    target_file = os.path.join(tmp_dir, 'true_labels_ensemble.npy')
    with lockfile.LockFile(target_file + '.lock'):
        # Leave an already existing label file untouched.
        already_saved = os.path.exists(target_file)
        if not already_saved:
            np.save(target_file, ensemble_targets)

    watcher.stop_task(timer_label)
def __init__(self, data_manager, configuration, with_predictions=False,
             all_scoring_functions=False, seed=1, output_dir=None,
             output_y_test=False, num_run=None):
    """Set up the evaluator, split the training data and build the model.

    All arguments are forwarded unchanged to the parent initializer.
    Afterwards ``data_manager.data['X_train']`` and
    ``data_manager.data['Y_train']`` are divided by ``split_data`` into a
    training part (``X_train``/``Y_train``) and an optimization part
    (``X_optimization``/``Y_optimization``), and the model is
    instantiated from ``self.model_class``.

    :param data_manager: provides ``info['task']`` and the
        ``data['X_train']`` / ``data['Y_train']`` arrays
    :param configuration: forwarded to the parent initializer
    :param with_predictions: forwarded to the parent initializer
    :param all_scoring_functions: forwarded to the parent initializer
    :param seed: forwarded to the parent initializer
    :param output_dir: forwarded to the parent initializer
    :param output_y_test: forwarded to the parent initializer
    :param num_run: forwarded to the parent initializer
    """
    parent = super(HoldoutEvaluator, self)
    parent.__init__(data_manager, configuration,
                    with_predictions=with_predictions,
                    all_scoring_functions=all_scoring_functions,
                    seed=seed,
                    output_dir=output_dir,
                    output_y_test=output_y_test,
                    num_run=num_run)

    # The split is told whether the task is one of CLASSIFICATION_TASKS.
    is_classification = data_manager.info['task'] in CLASSIFICATION_TASKS

    features = data_manager.data['X_train']
    targets = data_manager.data['Y_train']
    parts = split_data(features, targets,
                       classification=is_classification)
    (self.X_train, self.X_optimization,
     self.Y_train, self.Y_optimization) = parts

    # model_class, configuration and seed are read from the instance;
    # presumably they are set by the parent initializer.
    self.model = self.model_class(self.configuration, self.seed)