def load_semi_supervised(n_labeled=100, cut_off=1000, seed=123456, conv=False, extra=False):
    """
    Load the SVHN dataset with labels kept for only a fraction of the points.

    ``create_semi_supervised`` draws the labeled subset so that it is evenly
    distributed across classes; the remaining points form the unlabeled set.

    :param n_labeled: Number of labeled data points.
    :param cut_off: Cut-off constant forwarded to ``cut_off_dataset`` so that
        the unlabeled training set is divisible.
    :param seed: Seed of the pseudo random generator used for all shuffling.
    :param conv: If true, the unlabeled training, test and validation sets are
        passed through ``_gen_conv``.
    :param extra: Forwarded to ``_download``: include the extra set or not.
    :return: Unlabeled train set, labeled train set, test set, validation set
        (the validation set stays ``None`` when ``_download`` provides none).
    """
    # Number of trailing columns that hold the targets after concatenation.
    n_target_cols = 10

    prng = np.random.RandomState(seed=seed)
    raw_train, test_set, valid_set = _download(extra)

    # Split the raw training data into a labeled and an unlabeled part.
    x_lab, t_lab, x_unlab, t_unlab = create_semi_supervised(raw_train, n_labeled, prng)
    train_set_labeled = (x_lab, t_lab)

    # Glue inputs and targets side by side so that one shuffle of the rows
    # keeps every input aligned with its target, then split them again.
    joined = np.concatenate((x_unlab, t_unlab), axis=1)
    prng.shuffle(joined)
    shuffled = (joined[:, :-n_target_cols], joined[:, -n_target_cols:])
    train_set = cut_off_dataset(shuffled, cut_off, prng)

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    if not conv:
        return train_set, train_set_labeled, test_set, valid_set

    # NOTE(review): the labeled training set is not passed through _gen_conv;
    # confirm that callers expect this.
    train_set = _gen_conv(train_set)
    test_set = _gen_conv(test_set)
    if valid_set is not None:
        valid_set = _gen_conv(valid_set)
    return train_set, train_set_labeled, test_set, valid_set
def load_semi_supervised(n_labeled=1000, cut_off=1000, seed=123456, conv=False, extra=False):
    """
    Load SVHN for semi-supervised training: only part of the data is labeled.

    The labeled points are selected by ``create_semi_supervised`` so that they
    are spread evenly over the classes.

    :param n_labeled: Number of labeled data points.
    :param cut_off: Constant passed to ``cut_off_dataset`` so that the
        unlabeled training set is divisible.
    :param seed: Seed for the pseudo random shuffling of data points.
    :param conv: When true, ``_gen_conv`` is applied to the unlabeled training
        set, the test set and the validation set.
    :param extra: Whether ``_download`` should include the extra set.
    :return: Train set unlabeled, train set labeled, test set, validation set.
    """
    random_state = np.random.RandomState(seed=seed)
    downloaded_train, test_set, valid_set = _download(extra)

    # Labeled / unlabeled split, evenly distributed across classes.
    labeled_x, labeled_t, unlabeled_x, unlabeled_t = create_semi_supervised(
        downloaded_train, n_labeled, random_state
    )

    # Shuffle inputs and targets together: stack them column-wise, shuffle the
    # rows in place, then cut the last 10 (target) columns off again.
    stacked = np.concatenate((unlabeled_x, unlabeled_t), axis=1)
    random_state.shuffle(stacked)
    inputs, targets = stacked[:, :-10], stacked[:, -10:]
    train_set = cut_off_dataset((inputs, targets), cut_off, random_state)

    test_set = pad_targets(test_set)
    valid_set = pad_targets(valid_set) if valid_set is not None else None

    if conv:
        train_set, test_set = _gen_conv(train_set), _gen_conv(test_set)
        valid_set = _gen_conv(valid_set) if valid_set is not None else None

    # The labeled set is returned exactly as create_semi_supervised produced it.
    return train_set, (labeled_x, labeled_t), test_set, valid_set
def _norb_split_pairs(x):
    """Reshape each row of ``x`` into two equally wide halves: ``(n, 2, width // 2)``."""
    # Floor division keeps the dimension an int; NumPy raises a TypeError for
    # a float shape entry such as ``x.shape[1] / 2.``.
    return x.reshape((-1, 2, x.shape[1] // 2))


def _norb_unroll_pairs(x, t):
    """Stack both halves of every row of ``x`` along axis 0 and repeat ``t`` to match."""
    pairs = _norb_split_pairs(x)
    unrolled_x = np.append(pairs[:, 0, :], pairs[:, 1, :], axis=0)
    return unrolled_x, np.append(t, t, axis=0)


def _norb_first_of_pair(x):
    """Keep only the first half of every row of ``x``."""
    pairs = _norb_split_pairs(x)
    return pairs[:, 0, :].reshape((-1, pairs.shape[-1]))


def load_semi_supervised(n_labeled=100, cut_off=100, seed=123456, expand_channels=False, remove_channels=False):
    """
    Load the NORB dataset where only a fraction of data points are labeled.

    The labeled data points are selected by ``create_semi_supervised`` so that
    they are evenly distributed across classes.

    :param n_labeled: Number of labeled data points.
    :param cut_off: A cut-off constant forwarded to ``cut_off_dataset`` so that
        the unlabeled training set is divisible.
    :param seed: The seed for the pseudo random shuffle of data points.
    :param expand_channels: The pairwise images are rolled out to one dataset:
        both images of a pair become separate rows and targets are repeated.
        Takes precedence over ``remove_channels``.
    :param remove_channels: The second image in each pair is removed.
    :return: Train set unlabeled and labeled, test set, validation set. The
        validation set is passed through as ``None`` if ``_download`` returns
        none.
    """
    rng = np.random.RandomState(seed=seed)
    train_set, test_set, valid_set = _download()

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)

    if expand_channels:
        x_l, y_l = _norb_unroll_pairs(x_l, y_l)
        x_u, y_u = _norb_unroll_pairs(x_u, y_u)
        test_x, test_t = test_set
        test_set = _norb_unroll_pairs(test_x, test_t)
        # Guard like the padding step below: the validation set may be absent.
        if valid_set is not None:
            valid_x, valid_t = valid_set
            valid_set = _norb_unroll_pairs(valid_x, valid_t)
    elif remove_channels:
        x_l = _norb_first_of_pair(x_l)
        x_u = _norb_first_of_pair(x_u)
        test_x, test_t = test_set
        test_set = (_norb_first_of_pair(test_x), test_t)
        if valid_set is not None:
            valid_x, valid_t = valid_set
            valid_set = (_norb_first_of_pair(valid_x), valid_t)

    train_set_labeled = (x_l, y_l)

    # Shuffle inputs and targets together: join them column-wise, shuffle the
    # rows in place, then split at the input width. Splitting at the feature
    # count instead of a fixed number of target columns keeps the split
    # correct for any target width.
    n_features = x_u.shape[1]
    train_collect = np.append(x_u, y_u, axis=1)
    rng.shuffle(train_collect)
    train_set = (train_collect[:, :n_features], train_collect[:, n_features:])

    train_set = cut_off_dataset(train_set, cut_off, rng)

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    return train_set, train_set_labeled, test_set, valid_set