Example 1
import numpy as np
from pylearn2.utils.rng import make_np_rng
from sklearn.cross_validation import StratifiedShuffleSplit


def split_patients(patients, valid_percent, test_percent, rng=(2014, 10, 22)):
    # A seed tuple is turned into a numpy RandomState via pylearn2's helper.
    if isinstance(rng, (list, tuple)):
        rng = make_np_rng(None, rng, which_method='uniform')

    vals = np.asarray(list(patients.values()))
    keys = np.asarray(list(patients.keys()))
    # Legacy (pre-0.18) scikit-learn API: the labels go to the constructor
    # and the splitter itself is iterated.
    sss = StratifiedShuffleSplit(
        vals, n_iter=1, test_size=test_percent, random_state=rng)
    remaining_idx, test_idx = next(iter(sss))

    if valid_percent > 0:
        # Fraction of the remaining samples needed for the validation set.
        valid_rate = valid_percent / (1 - test_percent)

        sss = StratifiedShuffleSplit(
            vals[remaining_idx], n_iter=1, test_size=valid_rate,
            random_state=rng)
        tr_idx, val_idx = next(iter(sss))
        # Map indices of the second split back into the original arrays.
        valid_idx = remaining_idx[val_idx]
        train_idx = remaining_idx[tr_idx]
    else:
        train_idx = remaining_idx
        valid_idx = []

    train_patients = dict(zip(keys[train_idx], vals[train_idx]))
    valid_patients = dict(zip(keys[valid_idx], vals[valid_idx]))
    test_patients = dict(zip(keys[test_idx], vals[test_idx]))
    return train_patients, valid_patients, test_patients
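
As a quick, hypothetical usage sketch (the patient IDs and label scheme below are made up; the dict maps patient IDs to class labels):

# Hypothetical call: 100 patients with binary labels.
patients = {'p%03d' % i: i % 2 for i in range(100)}
train, valid, test = split_patients(patients, valid_percent=0.1,
                                    test_percent=0.2)
print(len(train), len(valid), len(test))  # expected: 70 10 20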
Example 2
from sklearn import datasets
from sklearn.cross_validation import StratifiedShuffleSplit


def simple_classification(n_samples=100, n_features=10, random_state=33):
    """
    Generate a simple classification task for training.

    Parameters
    ----------
    n_samples : int
        Number of samples in the dataset.
    n_features : int
        Number of features per sample.
    random_state : int
        Random state that makes results reproducible.

    Returns
    -------
    tuple
        Tuple of four variables: train input, test input,
        train target and test target, respectively.
    """
    X, y = datasets.make_classification(n_samples=n_samples,
                                        n_features=n_features,
                                        random_state=random_state)
    shuffle_split = StratifiedShuffleSplit(y, 1, train_size=0.6,
                                           random_state=random_state)

    train_index, test_index = next(iter(shuffle_split))
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    return x_train, x_test, y_train, y_test
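
All of the examples on this page use the legacy splitter from sklearn.cross_validation, which was deprecated in scikit-learn 0.18 and removed in 0.20. A minimal sketch of the equivalent split with the current sklearn.model_selection API (assuming scikit-learn >= 0.18):

from sklearn import datasets
from sklearn.model_selection import StratifiedShuffleSplit

X, y = datasets.make_classification(n_samples=100, n_features=10,
                                    random_state=33)
# The modern class takes no labels in the constructor; X and y go to
# split(), which yields (train_index, test_index) pairs.
splitter = StratifiedShuffleSplit(n_splits=1, train_size=0.6,
                                  random_state=33)
train_index, test_index = next(splitter.split(X, y))

The only structural change is that the labels move from the constructor to split(), and n_iter becomes n_splits.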
Example 3
    def setUp(self):
        super(QuasiNewtonTestCase, self).setUp()

        # Requires `datasets` and `StratifiedShuffleSplit` imported at
        # module level, as in the previous examples.
        X, y = datasets.make_classification(n_samples=100, n_features=10,
                                            random_state=33)
        shuffle_split = StratifiedShuffleSplit(y, 1, train_size=0.6,
                                               random_state=33)

        train_index, test_index = next(iter(shuffle_split))
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        self.X, self.y = X, y
        self.data = (x_train, x_test, y_train, y_test)
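
Because the split is stratified, both fixture halves keep the class balance of y, which is what makes a 60/40 split safe for small test fixtures. A quick check (a sketch, reusing the variables from setUp above):

import numpy as np

# Class frequencies should match between the two halves.
print(np.bincount(y_train) / float(len(y_train)))
print(np.bincount(y_test) / float(len(y_test)))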
Example 4
def get_rows_msr(data):
    # `get_config` and `get_rng` are helpers from the surrounding project;
    # `shuffle` is presumably random.shuffle, with `rng.rand` supplying
    # the randomness.
    conf = get_config()
    rng = get_rng()
    # data[1] holds the rows; data[3] and data[2] hold train/test indices.
    train = [data[1][idx] for idx in data[3]]
    test = [data[1][idx] for idx in data[2]]
    # test = [data[1][idx] for idx in data[4]]
    shuffle(train, rng.rand)
    train_y = [y for y, o, p in train]
    # Build the dev set with a stratified 80/20 split of the training rows.
    sss = StratifiedShuffleSplit(
        train_y, 1, train_size=0.8, test_size=0.2, random_state=rng)
    train_index, dev_index = next(iter(sss))
    return ([train[i] for i in train_index],
            [train[i] for i in dev_index],
            test)
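
Note that random_state accepts either an int seed or a numpy RandomState instance, as returned by get_rng above. A small sketch with made-up labels:

import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit

train_y = [0, 1] * 10
rng = np.random.RandomState(42)
# Both forms are valid; passing a RandomState shares one stream of
# randomness across several splitters.
sss_seed = StratifiedShuffleSplit(train_y, 1, test_size=0.2, random_state=42)
sss_state = StratifiedShuffleSplit(train_y, 1, test_size=0.2, random_state=rng)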
Example 5
    dataset = task.get_dataset()
    # Impute missing values. Some meta-features would impute anyway,
    # but doing it here gives more control.
    X, y, categorical = dataset.get_data(target=task.target_feature,
                                         return_categorical_indicator=True)

    # X, categorical = remove_zero_columns(impute_values(X, categorical), categorical)

    # Subsample landmarkers need folds, and their train+test set should
    # hold 500 instances, since that is the size of our smallest dataset.
    # First carve out 500 stratified samples, then divide that selection
    # into 10 folds.
    max_size = 500
    number_of_classes = len(np.unique(y))
    if y.shape[0] < (max_size + number_of_classes):
        # Too few samples to stratify 500 out; take the first max_size rows
        # (datasets are assumed to have at least 500 instances).
        subset_indices = np.arange(max_size)
    else:
        subset_split = StratifiedShuffleSplit(y, n_iter=1, test_size=500,
                                              random_state=0)
        _, subset_indices = next(iter(subset_split))
    mapped_folds = StratifiedShuffleSplit(y[subset_indices], n_iter=10,
                                          test_size=0.2, random_state=0)

    # The fold indices are positions within the subset, so map them back
    # to indices of the full dataset.
    subsample_folds = [(subset_indices[train], subset_indices[test])
                       for train, test in mapped_folds]

    # The subsamples have constant size (always 500), so compute them once
    # per dataset, not once for every subsample of every dataset (those
    # are stratified anyway).
    log("subsample-mf")
    subsample_features = subsample_metafeatures(X, y, categorical,
                                                subsample_folds)

    # Also take subsets of the original dataset; this creates a bigger
    # meta-dataset to learn from.
    for i in np.arange(0.1, 1.01, 0.1):

        # Require a minimum size of 500: below that, predicting runtime is
        # not very useful, and it avoids train/test splits that are too
        # small for timing to be measured accurately.
        if int(i * len(y)) >= 500:
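
The remapping into subsample_folds above is the key step: the inner splitter only sees positions within the 500-sample subset, so its fold indices have to be translated back into indices of the full dataset. A self-contained sketch of the same pattern with the modern API (made-up data; assumes scikit-learn >= 0.18):

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

y = np.repeat([0, 1], 1000)        # 2000 made-up labels
X = np.zeros((len(y), 1))          # placeholder features

# Outer split: carve out a stratified subset of 500 samples.
outer = StratifiedShuffleSplit(n_splits=1, test_size=500, random_state=0)
_, subset_indices = next(outer.split(X, y))

# Inner splits: 10 folds over the subset; positions within the subset are
# translated back to positions in the full dataset.
inner = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
folds = [(subset_indices[tr], subset_indices[te])
         for tr, te in inner.split(X[subset_indices], y[subset_indices])]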