def load_semi_supervised(n_labeled=100, max_parse_size=1012, seed=123456):
    """
    Load the AG News dataset where only a fraction of data points are labeled.
    The amount of labeled data will be evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param max_parse_size: The fixed length of a string. The largest needed is 1012.
    :param seed: The seed for the pseudo random shuffle of data points.
    :return: Train set unlabeled and labeled, test set, validation set.
    """
    train_set, test_set, valid_set = _load(max_parse_size)
    rng = np.random.RandomState(seed=seed)
    n_classes = train_set[1].max() + 1
    print(n_classes)

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)
    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)

    # Shuffle the unlabeled data: concatenate inputs and one-hot targets,
    # permute the rows, and split the two blocks apart again.
    train_x, train_t = train_set
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    train_set = (train_collect[:, :-n_classes], train_collect[:, -n_classes:])

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    return train_set, train_set_labeled, test_set, valid_set

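# create_semi_supervised is called above but not defined in this listing. The
# following is only a minimal sketch of the behaviour the callers appear to rely
# on (an even number of labeled examples per class, one-hot targets for both
# splits); it is an assumption, not the repository's actual implementation.
import numpy as np

def create_semi_supervised_sketch(train_set, n_labeled, rng):
    """Split (x, y) into a labeled subset, spread evenly over classes, and the rest."""
    x, y = train_set
    y = np.asarray(y, dtype='int32').ravel()
    n_classes = y.max() + 1
    n_per_class = n_labeled // n_classes
    labeled_idx = []
    for c in range(n_classes):
        idx_c = np.where(y == c)[0]
        rng.shuffle(idx_c)
        labeled_idx.append(idx_c[:n_per_class])
    labeled_idx = np.concatenate(labeled_idx)
    unlabeled_idx = np.setdiff1d(np.arange(y.shape[0]), labeled_idx)
    one_hot = np.eye(n_classes, dtype='float32')[y]
    return x[labeled_idx], one_hot[labeled_idx], x[unlabeled_idx], one_hot[unlabeled_idx]
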
def load_semi_supervised(): """ Load the half moon dataset with 6 fixed labeled data points. """ train_set, test_set, valid_set = _download() # Add 6 static labels. train_x_l = np.zeros((6, 2)) train_t_l = np.array([0, 0, 0, 1, 1, 1]) # Top halfmoon train_x_l[0] = [.7, 1.7] # left train_x_l[1] = [1.6, 2.6] # middle train_x_l[2] = [2.7, 1.7] # right # Bottom halfmoon train_x_l[3] = [1.6, 2.0] # left train_x_l[4] = [2.7, 1.1] # middle train_x_l[5] = [3.5, 2.0] # right train_set_labeled = (train_x_l, train_t_l) train_set_labeled = pad_targets(train_set_labeled) train_set = pad_targets(train_set) test_set = pad_targets(test_set) if valid_set is not None: valid_set = pad_targets(valid_set) return train_set, train_set_labeled, test_set, valid_set
def load_supervised(filter_std=0.1, train_valid_combine=False):
    """
    Load the MNIST dataset.
    :param filter_std: The standard deviation threshold for keeping features.
    :param train_valid_combine: Whether the train set and validation set should be combined.
    :return: The train, test and validation sets.
    """
    train_set, test_set, valid_set = _download()

    if train_valid_combine:
        train_set = (np.append(train_set[0], valid_set[0], axis=0),
                     np.append(train_set[1], valid_set[1], axis=0))

    # Filter out the features with a low standard deviation.
    if filter_std > .0:
        train_x, train_t = train_set
        idx_keep = np.std(train_x, axis=0) > filter_std
        train_x = train_x[:, idx_keep]
        valid_set = (valid_set[0][:, idx_keep], valid_set[1])
        test_set = (test_set[0][:, idx_keep], test_set[1])
        train_set = (train_x, train_t)

    test_set = pad_targets(test_set)
    valid_set = pad_targets(valid_set)
    train_set = pad_targets(train_set)

    return train_set, test_set, valid_set

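# A small worked example of the filter_std step above: features whose standard
# deviation over the training inputs is at or below the threshold are dropped,
# and the same boolean mask is applied to the validation and test inputs.
import numpy as np

x = np.array([[0.0, 0.2, 1.0],
              [0.0, 0.8, 0.0],
              [0.0, 0.5, 1.0]])       # column 0 is constant (e.g. an always-black pixel)
idx_keep = np.std(x, axis=0) > 0.1    # array([False,  True,  True])
x_filtered = x[:, idx_keep]           # shape (3, 2): the constant column is gone
print(idx_keep, x_filtered.shape)
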
def load_supervised(max_parse_size=1014):
    """
    Load the AG News dataset.
    :param max_parse_size: The fixed length of a string. The largest needed is 1012.
    :return: The train, test and validation sets.
    """
    train_set, test_set, valid_set = _load(max_parse_size)

    test_set = pad_targets(test_set)
    valid_set = pad_targets(valid_set)
    train_set = pad_targets(train_set)

    return train_set, test_set, valid_set

def load_semi_supervised(n_labeled=100, filter_std=0.1, seed=123456, train_valid_combine=False):
    """
    Load the MNIST dataset where only a fraction of data points are labeled.
    The amount of labeled data will be evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param filter_std: The standard deviation threshold for keeping features.
    :param seed: The seed for the pseudo random shuffle of data points.
    :param train_valid_combine: Whether the train set and validation set should be combined.
    :return: Train set unlabeled and labeled, test set, validation set.
    """
    # Previously the sets were fetched with _download() and optionally combined:
    # train_set, test_set, valid_set = _download()
    # if train_valid_combine:
    #     train_set = (np.append(train_set[0], valid_set[0], axis=0),
    #                  np.append(train_set[1], valid_set[1], axis=0))
    train_set, valid_set, test_set = data_generator()
    print('data_ready')

    rng = np.random.RandomState(seed=seed)

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)

    # Filter out the features with a low standard deviation.
    if filter_std > .0:
        idx_keep = np.std(x_u, axis=0) > filter_std
        x_l, x_u = x_l[:, idx_keep], x_u[:, idx_keep]
        valid_set = (valid_set[0][:, idx_keep], valid_set[1])
        test_set = (test_set[0][:, idx_keep], test_set[1])

    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)

    # Shuffle the unlabeled data, keeping inputs and one-hot targets aligned.
    train_x, train_t = train_set
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    train_set = (train_collect[:, :-10], train_collect[:, -10:])

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    return train_set, train_set_labeled, test_set, valid_set

def load_semi_supervised(n_labeled=100, cut_off=1000, seed=123456, conv=False, extra=False):
    """
    Load the SVHN dataset where only a fraction of data points are labeled.
    The amount of labeled data will be evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param cut_off: A cut off constant so that the data set is divisible.
    :param seed: The seed for the pseudo random shuffle of data points.
    :param conv: Boolean whether the images should be vectorized or not.
    :param extra: Include the extra set or not.
    :return: Train set unlabeled and labeled, test set, validation set.
    """
    rng = np.random.RandomState(seed=seed)
    train_set, test_set, valid_set = _download(extra)

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)
    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)

    # Shuffle the unlabeled data, keeping inputs and one-hot targets aligned.
    train_x, train_t = train_set
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    train_set = (train_collect[:, :-10], train_collect[:, -10:])
    train_set = cut_off_dataset(train_set, cut_off, rng)

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    if conv:
        train_set = _gen_conv(train_set)
        test_set = _gen_conv(test_set)
        if valid_set is not None:
            valid_set = _gen_conv(valid_set)

    return train_set, train_set_labeled, test_set, valid_set

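# cut_off_dataset is called above but not defined in this listing. A minimal
# sketch of one plausible reading of its docstring (drop a random remainder so
# the number of rows is divisible by cut_off); this is an assumption, not the
# repository's actual implementation.
import numpy as np

def cut_off_dataset_sketch(dataset, cut_off, rng):
    """Trim (x, t) so the number of rows is a multiple of cut_off."""
    x, t = dataset
    n_keep = (x.shape[0] // cut_off) * cut_off
    idx = rng.permutation(x.shape[0])[:n_keep]
    return x[idx], t[idx]
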
def load_semi_supervised(n_labeled=100, filter_std=0.1, seed=123456, train_valid_combine=False):
    """
    Load the MNIST dataset where only a fraction of data points are labeled.
    The amount of labeled data will be evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param filter_std: The standard deviation threshold for keeping features.
    :param seed: The seed for the pseudo random shuffle of data points.
    :param train_valid_combine: Whether the train set and validation set should be combined.
    :return: Train set unlabeled and labeled, test set, validation set.
    """
    train_set, test_set, valid_set = _download()

    # Combine the train set and validation set.
    if train_valid_combine:
        train_set = (np.append(train_set[0], valid_set[0], axis=0),
                     np.append(train_set[1], valid_set[1], axis=0))

    rng = np.random.RandomState(seed=seed)

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)

    # Filter out the features with a low standard deviation.
    if filter_std > .0:
        idx_keep = np.std(x_u, axis=0) > filter_std
        x_l, x_u = x_l[:, idx_keep], x_u[:, idx_keep]
        valid_set = (valid_set[0][:, idx_keep], valid_set[1])
        test_set = (test_set[0][:, idx_keep], test_set[1])

    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)

    # Shuffle the unlabeled data, keeping inputs and one-hot targets aligned.
    train_x, train_t = train_set
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    train_set = (train_collect[:, :-10], train_collect[:, -10:])

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    return train_set, train_set_labeled, test_set, valid_set

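# A small worked example of the shuffle step shared by the semi-supervised
# loaders: inputs and one-hot targets are concatenated column-wise, the rows
# are permuted in place, and the two blocks are split apart again so inputs
# and targets stay aligned.
import numpy as np

rng = np.random.RandomState(0)
x = np.arange(8).reshape((4, 2)).astype('float32')
t = np.eye(2, dtype='float32')[[0, 1, 0, 1]]       # one-hot targets for 2 classes

collect = np.append(x, t, axis=1)                  # shape (4, 4)
rng.shuffle(collect)                               # shuffles rows in place
x_shuf, t_shuf = collect[:, :-2], collect[:, -2:]
print(x_shuf.shape, t_shuf.shape)                  # (4, 2) (4, 2)
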
def load_supervised(conv=False, extra=False, normalize=True):
    """
    Load the SVHN dataset.
    :param conv: Boolean whether the images should be vectorized or not.
    :param extra: Include the extra set or not.
    :param normalize: Boolean whether to normalize the data set.
    :return: The train, test and validation sets.
    """
    train_set, test_set, valid_set = _download(extra, normalize)

    test_set = pad_targets(test_set)
    train_set = pad_targets(train_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    if conv:
        test_set = _gen_conv(test_set)
        train_set = _gen_conv(train_set)
        if valid_set is not None:
            valid_set = _gen_conv(valid_set)

    return train_set, test_set, valid_set

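# _gen_conv is used above but not defined in this listing. A minimal sketch of
# what it plausibly does for SVHN: reshape flattened rows back into
# (channels, height, width) image tensors. The (3, 32, 32) layout is an
# assumption about how the vectors were stored.
import numpy as np

def _gen_conv_sketch(dataset):
    """Reshape the flattened SVHN inputs of an (x, t) tuple into image tensors."""
    x, t = dataset
    return x.reshape((-1, 3, 32, 32)), t
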
def load_semi_supervised(n_labeled=1000, cut_off=1000, seed=123456, conv=False, extra=False):
    """
    Load the SVHN dataset where only a fraction of data points are labeled.
    The amount of labeled data will be evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param cut_off: A cut off constant so that the data set is divisible.
    :param seed: The seed for the pseudo random shuffle of data points.
    :param conv: Boolean whether the images should be vectorized or not.
    :param extra: Include the extra set or not.
    :return: Train set unlabeled and labeled, test set, validation set.
    """
    rng = np.random.RandomState(seed=seed)
    train_set, test_set, valid_set = _download(extra)

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)
    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)

    # Shuffle the unlabeled data, keeping inputs and one-hot targets aligned.
    train_x, train_t = train_set
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    train_set = (train_collect[:, :-10], train_collect[:, -10:])
    train_set = cut_off_dataset(train_set, cut_off, rng)

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    if conv:
        train_set = _gen_conv(train_set)
        test_set = _gen_conv(test_set)
        if valid_set is not None:
            valid_set = _gen_conv(valid_set)

    return train_set, train_set_labeled, test_set, valid_set

def load_semi_supervised(n_labeled=100, cut_off=100, seed=123456, expand_channels=False, remove_channels=False):
    """
    Load the NORB dataset where only a fraction of data points are labeled.
    The amount of labeled data will be evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param cut_off: A cut off constant so that the data set is divisible.
    :param seed: The seed for the pseudo random shuffle of data points.
    :param expand_channels: The pairwise images are rolled out to one dataset.
    :param remove_channels: The second image in each pair is removed.
    :return: Train set unlabeled and labeled, test set, validation set.
    """
    rng = np.random.RandomState(seed=seed)
    train_set, test_set, valid_set = _download()

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)

    if expand_channels:
        # Roll out each stereo pair into two separate data points with the same target.
        x_l = x_l.reshape((-1, 2, x_l.shape[1] // 2))
        x_l_1 = x_l[:, 0, :]
        x_l_2 = x_l[:, 1, :]
        x_l = np.append(x_l_1, x_l_2, axis=0)
        y_l = np.append(y_l, y_l, axis=0)
        x_u = x_u.reshape((-1, 2, x_u.shape[1] // 2))
        x_u_1 = x_u[:, 0, :]
        x_u_2 = x_u[:, 1, :]
        x_u = np.append(x_u_1, x_u_2, axis=0)
        y_u = np.append(y_u, y_u, axis=0)
        test_x, test_t = test_set
        test_x = test_x.reshape((-1, 2, test_x.shape[1] // 2))
        test_x = np.append(test_x[:, 0, :], test_x[:, 1, :], axis=0)
        test_t = np.append(test_t, test_t, axis=0)
        test_set = (test_x, test_t)
        valid_x, valid_t = valid_set
        valid_x = valid_x.reshape((-1, 2, valid_x.shape[1] // 2))
        valid_x = np.append(valid_x[:, 0, :], valid_x[:, 1, :], axis=0)
        valid_t = np.append(valid_t, valid_t, axis=0)
        valid_set = (valid_x, valid_t)
    elif remove_channels:
        # Keep only the first image of each stereo pair.
        x_l = x_l.reshape((-1, 2, x_l.shape[1] // 2))
        x_l = x_l[:, 0, :].reshape((-1, x_l.shape[-1]))
        x_u = x_u.reshape((-1, 2, x_u.shape[1] // 2))
        x_u = x_u[:, 0, :].reshape((-1, x_u.shape[-1]))
        test_x, test_t = test_set
        test_x = test_x.reshape((-1, 2, test_x.shape[1] // 2))
        test_x = test_x[:, 0, :].reshape((-1, test_x.shape[-1]))
        test_set = (test_x, test_t)
        valid_x, valid_t = valid_set
        valid_x = valid_x.reshape((-1, 2, valid_x.shape[1] // 2))
        valid_x = valid_x[:, 0, :].reshape((-1, valid_x.shape[-1]))
        valid_set = (valid_x, valid_t)

    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)

    # Shuffle the unlabeled data, keeping inputs and one-hot targets aligned.
    train_x, train_t = train_set
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    train_set = (train_collect[:, :-5], train_collect[:, -5:])
    train_set = cut_off_dataset(train_set, cut_off, rng)

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    return train_set, train_set_labeled, test_set, valid_set

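# A small worked example of the expand_channels reshape used above: each row
# holds a stereo pair (two images concatenated), which is split into two rows
# so every image becomes its own data point with a duplicated target.
import numpy as np

x = np.arange(12).reshape((2, 6)).astype('float32')       # 2 stereo pairs, 3 pixels per image
y = np.array([[1, 0], [0, 1]], dtype='float32')           # one-hot targets

x3 = x.reshape((-1, 2, x.shape[1] // 2))                  # (2, 2, 3): pair axis exposed
x_expanded = np.append(x3[:, 0, :], x3[:, 1, :], axis=0)  # (4, 3)
y_expanded = np.append(y, y, axis=0)                      # (4, 2)
print(x_expanded.shape, y_expanded.shape)
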