예제 #1
0
 def test_split_classification_many_imbalanced_classes(self):
     """Check the validation labels when some classes occur only once.

     The label vector holds eight samples each of classes 0 and 1 and a
     single sample each of classes 2, 3, 4 and 5. The labels are shuffled
     and split ten times; every time the largest label in the validation
     split must be at most 1, i.e. none of the single-sample classes may
     appear there.
     """
     # The loop index is not needed; the repetition only serves to try
     # several different shuffles of the labels.
     for _ in range(10):
         X = np.array([range(20), range(20)]).transpose()
         y = np.array((0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
                       4, 5))
         np.random.shuffle(y)
         # Only the validation labels are inspected by this test.
         _, _, _, Y_valid = split_data(
             X, y,
             classification=True)
         self.assertLessEqual(max(Y_valid), 1)
예제 #2
0
    def _split_regular(self):
        """Split six samples with labels 0/1/2 and pin the exact result.

        Expects four training and two validation samples, and checks the
        exact label order of both parts.

        NOTE(review): the name lacks the ``test`` prefix, so default
        unittest discovery will presumably not collect it - confirm this is
        intended.
        """
        features = np.array([[1, 2], [3, 4]] * 3)
        labels = np.array([0, 0, 0, 1, 1, 2])
        X_train, X_valid, Y_train, Y_valid = split_data(features, labels)

        # Check shapes
        shape_expectations = (
            (X_train, (4, 2)),
            (Y_train, (4, )),
            (X_valid, (2, 2)),
            (Y_valid, (2, )),
        )
        for part, wanted_shape in shape_expectations:
            self.assertEqual(part.shape, wanted_shape)

        # Check the exact labels assigned to each part
        self.assertListEqual(list(Y_valid), [0, 0])
        self.assertListEqual(list(Y_train), [2, 0, 1, 1])
예제 #3
0
    def _stratify(self):
        """Split six samples with four 0-labels and two 1-labels.

        Expects four training and two validation samples (two feature
        columns each) and checks the exact label order of both parts.

        NOTE(review): the name lacks the ``test`` prefix, so default
        unittest discovery will presumably not collect it - confirm this is
        intended.
        """
        features = np.array([[1, 2], [3, 4]] * 3)
        labels = np.array([0, 0, 0, 0, 1, 1])
        X_train, X_valid, Y_train, Y_valid = split_data(features, labels)

        # Check shapes: (array, axis, expected size along that axis)
        size_expectations = (
            (X_train, 0, 4),
            (X_train, 1, 2),
            (Y_train, 0, 4),
            (X_valid, 0, 2),
            (X_valid, 1, 2),
            (Y_valid, 0, 2),
        )
        for part, axis, wanted_size in size_expectations:
            self.assertEqual(part.shape[axis], wanted_size)

        # Check the exact labels assigned to each part
        self.assertListEqual(list(Y_valid), [1, 0])
        self.assertListEqual(list(Y_train), [0, 0, 0, 1])
예제 #4
0
    def test_split_data_regression(self):
        """Split 1000 random regression samples and pin sizes and values.

        The NumPy global RNG is seeded with 42 before the number of feature
        columns, the features and the targets are drawn, so the data is
        reproducible. The split must yield 670 training and 330 validation
        rows, keep the number of columns, and contain two pinned entries.
        """
        n_samples = 1000
        np.random.seed(42)
        # Draw order matters for reproducibility: column count, X, then y.
        n_features = np.random.randint(1, 100)
        features = np.random.rand(n_samples, n_features)
        targets = np.random.rand(n_samples)

        X_train, X_valid, Y_train, Y_valid = split_data(features, targets)

        # (array, axis, expected size along that axis)
        size_expectations = (
            (X_train, 0, 670),
            (X_valid, 0, 330),
            (Y_train, 0, 670),
            (Y_valid, 0, 330),
            (X_train, 1, n_features),
            (X_valid, 1, n_features),
        )
        for part, axis, wanted_size in size_expectations:
            self.assertEqual(part.shape[axis], wanted_size)

        # Random checks
        self.assertAlmostEqual(X_train[4, 2], 0.5986584841970366)
        self.assertAlmostEqual(X_valid[4, 2], 0.63911512838980322)
예제 #5
0
def _save_ensemble_data(x_data, y_data, tmp_dir, watcher):
    """Split the dataset and store the ensemble targets on disk.

    The fourth value returned by ``split_data(x_data, y_data)`` is saved
    with ``np.save`` to ``true_labels_ensemble.npy`` inside ``tmp_dir``,
    unless that file already exists. The existence check and the write are
    performed while holding a lock file named after the target path with a
    ``.lock`` suffix.

    :param x_data: feature data handed to ``split_data``.
    :param y_data: target data handed to ``split_data``.
    :param tmp_dir: directory in which the ``.npy`` file is created.
    :param watcher: object providing ``start_task``/``stop_task``; the work
        is recorded under the task name ``'LoadData'``.
    :return: None

    """
    task_name = 'LoadData'
    watcher.start_task(task_name)
    try:
        _, _, _, y_ensemble = split_data(x_data, y_data)

        filepath = os.path.join(tmp_dir, 'true_labels_ensemble.npy')

        lock_path = filepath + '.lock'
        with lockfile.LockFile(lock_path):
            # Check inside the lock so the file is only written once.
            if not os.path.exists(filepath):
                np.save(filepath, y_ensemble)
    finally:
        # Always close the task, even when splitting or saving fails, so
        # the watcher is never left with a dangling 'LoadData' task.
        watcher.stop_task(task_name)
예제 #6
0
    def __init__(self, data_manager, configuration,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_dir=None,
                 output_y_test=False,
                 num_run=None):
        """Set up the evaluator, split the training data and build a model.

        All arguments are forwarded unchanged to the parent initializer.
        Afterwards ``data_manager.data['X_train']`` and
        ``data_manager.data['Y_train']`` are split with ``split_data`` into
        the ``X_train``/``Y_train`` and ``X_optimization``/``Y_optimization``
        attributes, and ``self.model`` is created from ``self.model_class``.

        :param data_manager: object exposing ``info['task']`` and
            ``data['X_train']`` / ``data['Y_train']``.
        :param configuration: forwarded to the parent initializer.
        :param with_predictions: forwarded to the parent initializer.
        :param all_scoring_functions: forwarded to the parent initializer.
        :param seed: forwarded to the parent initializer.
        :param output_dir: forwarded to the parent initializer.
        :param output_y_test: forwarded to the parent initializer.
        :param num_run: forwarded to the parent initializer.
        """
        parent_options = dict(
            with_predictions=with_predictions,
            all_scoring_functions=all_scoring_functions,
            seed=seed,
            output_dir=output_dir,
            output_y_test=output_y_test,
            num_run=num_run,
        )
        super(HoldoutEvaluator, self).__init__(
            data_manager, configuration, **parent_options)

        # Tell split_data whether the task is one of CLASSIFICATION_TASKS.
        is_classification = data_manager.info['task'] in CLASSIFICATION_TASKS
        full_features = data_manager.data['X_train']
        full_targets = data_manager.data['Y_train']
        split_parts = split_data(full_features, full_targets,
                                 classification=is_classification)
        (self.X_train, self.X_optimization,
         self.Y_train, self.Y_optimization) = split_parts

        # NOTE(review): model_class, configuration and seed are presumably
        # set by the parent initializer - confirm against the base class.
        self.model = self.model_class(self.configuration, self.seed)