Example #1
import numpy as np

# _download, create_semi_supervised, cut_off_dataset, pad_targets and _gen_conv
# are helpers defined elsewhere in the source module.

def load_semi_supervised(n_labeled=100,
                         cut_off=1000,
                         seed=123456,
                         conv=False,
                         extra=False):
    """
    Load the SVHN dataset where only a fraction of the data points are labeled.
    The labeled data points are evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param cut_off: A cut-off constant so that the data set size is divisible.
    :param seed: The seed for the pseudo-random shuffle of data points.
    :param conv: If True, reshape the images for convolutional networks; otherwise keep them vectorized.
    :param extra: Whether to include the SVHN extra training set.
    :return: Unlabeled train set, labeled train set, test set, validation set.
    """

    rng = np.random.RandomState(seed=seed)
    train_set, test_set, valid_set = _download(extra)

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)

    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)
    train_x, train_t = train_set

    # shuffle data
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    # The last 10 columns are the one-hot targets (SVHN's 10 classes).
    train_set = (train_collect[:, :-10], train_collect[:, -10:])

    train_set = cut_off_dataset(train_set, cut_off, rng)

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    if conv:
        train_set = _gen_conv(train_set)
        test_set = _gen_conv(test_set)
        if valid_set is not None:
            valid_set = _gen_conv(valid_set)

    return train_set, train_set_labeled, test_set, valid_set
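
The shuffle in this example works by appending the one-hot targets to the feature matrix column-wise, shuffling the rows once, and slicing the two parts back apart, which keeps every image aligned with its label. Below is a minimal self-contained sketch of that trick; shuffle_jointly and the toy shapes are illustrative, not part of the original module.

import numpy as np

def shuffle_jointly(x, y_onehot, rng):
    # Stack features and one-hot targets column-wise so a single row
    # shuffle permutes both in the same order, then split them apart.
    n_classes = y_onehot.shape[1]
    packed = np.append(x, y_onehot, axis=1)
    rng.shuffle(packed)  # in-place shuffle along the first axis
    return packed[:, :-n_classes], packed[:, -n_classes:]

rng = np.random.RandomState(123456)
x = rng.rand(6, 4)                        # six 4-dimensional "images"
y = np.eye(3)[rng.randint(0, 3, size=6)]  # one-hot targets, 3 classes
x_s, y_s = shuffle_jointly(x, y, rng)
assert x_s.shape == (6, 4) and y_s.shape == (6, 3)
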
Example #2

import numpy as np

def load_semi_supervised(n_labeled=1000, cut_off=1000, seed=123456, conv=False, extra=False):
    """
    Load the SVHN dataset where only a fraction of the data points are labeled.
    The labeled data points are evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param cut_off: A cut-off constant so that the data set size is divisible.
    :param seed: The seed for the pseudo-random shuffle of data points.
    :param conv: If True, reshape the images for convolutional networks; otherwise keep them vectorized.
    :param extra: Whether to include the SVHN extra training set.
    :return: Unlabeled train set, labeled train set, test set, validation set.
    """

    rng = np.random.RandomState(seed=seed)
    train_set, test_set, valid_set = _download(extra)

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)

    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)
    train_x, train_t = train_set

    # shuffle data
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    # The last 10 columns are the one-hot targets (SVHN's 10 classes).
    train_set = (train_collect[:, :-10], train_collect[:, -10:])

    train_set = cut_off_dataset(train_set, cut_off, rng)

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    if conv:
        train_set = _gen_conv(train_set)
        test_set = _gen_conv(test_set)
        if valid_set is not None:
            valid_set = _gen_conv(valid_set)

    return train_set, train_set_labeled, test_set, valid_set
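
create_semi_supervised itself is not shown on this page. The sketch below is a guess at what such a helper plausibly does, assuming the targets are one-hot encoded and n_labeled divides evenly by the number of classes; it is an assumption, not the original implementation.

import numpy as np

def create_semi_supervised_sketch(x, y_onehot, n_labeled, rng):
    # Pick n_labeled // n_classes examples from every class so the labeled
    # pool is evenly distributed; everything else becomes the unlabeled pool.
    n_classes = y_onehot.shape[1]
    per_class = n_labeled // n_classes
    mask = np.zeros(len(x), dtype=bool)
    for c in range(n_classes):
        idx = np.where(y_onehot[:, c] == 1)[0]
        rng.shuffle(idx)
        mask[idx[:per_class]] = True
    return x[mask], y_onehot[mask], x[~mask], y_onehot[~mask]

rng = np.random.RandomState(0)
x = rng.rand(30, 8)
y = np.eye(3)[np.repeat(np.arange(3), 10)]  # 10 examples per class
x_l, y_l, x_u, y_u = create_semi_supervised_sketch(x, y, 6, rng)
assert len(x_l) == 6 and len(x_u) == 24
assert y_l.sum(axis=0).tolist() == [2.0, 2.0, 2.0]  # 2 labeled per class
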
Example #3
import numpy as np

# _download, create_semi_supervised, cut_off_dataset and pad_targets are
# helpers defined elsewhere in the source module.

def load_semi_supervised(n_labeled=100, cut_off=100, seed=123456, expand_channels=False, remove_channels=False):
    """
    Load the NORB dataset where only a fraction of the data points are labeled.
    The labeled data points are evenly distributed across classes.
    :param n_labeled: Number of labeled data points.
    :param cut_off: A cut-off constant so that the data set size is divisible.
    :param seed: The seed for the pseudo-random shuffle of data points.
    :param expand_channels: If True, split each stereo image pair into two separate data points.
    :param remove_channels: If True, keep only the first image of each stereo pair.
    :return: Unlabeled train set, labeled train set, test set, validation set.
    """

    rng = np.random.RandomState(seed=seed)
    train_set, test_set, valid_set = _download()

    # Create the labeled and unlabeled data evenly distributed across classes.
    x_l, y_l, x_u, y_u = create_semi_supervised(train_set, n_labeled, rng)

    if expand_channels:
        x_l = x_l.reshape((-1, 2, x_l.shape[1] // 2))  # integer division: reshape dims must be ints
        x_l_1 = x_l[:, 0, :]
        x_l_2 = x_l[:, 1, :]
        x_l = np.append(x_l_1, x_l_2, axis=0)
        y_l = np.append(y_l, y_l, axis=0)
        x_u = x_u.reshape((-1, 2, x_u.shape[1] // 2))
        x_u_1 = x_u[:, 0, :]
        x_u_2 = x_u[:, 1, :]
        x_u = np.append(x_u_1, x_u_2, axis=0)
        y_u = np.append(y_u, y_u, axis=0)

        test_x, test_t = test_set
        test_x = test_x.reshape((-1, 2, test_x.shape[1] // 2))
        test_x = np.append(test_x[:, 0, :], test_x[:, 1, :], axis=0)
        test_t = np.append(test_t, test_t, axis=0)
        test_set = (test_x, test_t)

        valid_x, valid_t = valid_set
        valid_x = valid_x.reshape((-1, 2, valid_x.shape[1] // 2))
        valid_x = np.append(valid_x[:, 0, :], valid_x[:, 1, :], axis=0)
        valid_t = np.append(valid_t, valid_t, axis=0)
        valid_set = (valid_x, valid_t)

    elif remove_channels:
        # Keep only the first image of each stereo pair.
        x_l = x_l.reshape((-1, 2, x_l.shape[1] // 2))[:, 0, :]
        x_u = x_u.reshape((-1, 2, x_u.shape[1] // 2))[:, 0, :]

        test_x, test_t = test_set
        test_x = test_x.reshape((-1, 2, test_x.shape[1] // 2))[:, 0, :]
        test_set = (test_x, test_t)

        valid_x, valid_t = valid_set
        valid_x = valid_x.reshape((-1, 2, valid_x.shape[1] // 2))[:, 0, :]
        valid_set = (valid_x, valid_t)

    train_set = (x_u, y_u)
    train_set_labeled = (x_l, y_l)
    train_x, train_t = train_set

    # shuffle data
    train_collect = np.append(train_x, train_t, axis=1)
    rng.shuffle(train_collect)
    # The last 5 columns are the one-hot targets (NORB's 5 classes).
    train_set = (train_collect[:, :-5], train_collect[:, -5:])

    train_set = cut_off_dataset(train_set, cut_off, rng)

    test_set = pad_targets(test_set)
    if valid_set is not None:
        valid_set = pad_targets(valid_set)

    return train_set, train_set_labeled, test_set, valid_set
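
Both the expand_channels and remove_channels branches rely on reshaping each row into its two stereo images. The toy demonstration below shows that reshaping on a made-up array; the array contents and shapes are illustrative only.

import numpy as np

# Three stereo pairs, each stored as one row of 8 values (two 4-pixel images).
x = np.arange(24).reshape(3, 8)
pairs = x.reshape(-1, 2, x.shape[1] // 2)  # (3 pairs, 2 images, 4 pixels)

# expand_channels: stack the two images of every pair as separate rows,
# doubling the number of data points (targets would be duplicated likewise).
expanded = np.append(pairs[:, 0, :], pairs[:, 1, :], axis=0)
assert expanded.shape == (6, 4)

# remove_channels: keep only the first image of every pair.
reduced = pairs[:, 0, :]
assert reduced.shape == (3, 4)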