Code Example #1
import numpy

# DataSet is defined elsewhere in this project (afcarl/tensordynamic).

class SemiDataSet(object):
    def __init__(self, features, labels, unlabeled_features):
        # Despite the name, unlabeled_features is the *number of labeled
        # examples* to keep (see get_mnist_data_set_collection below)
        self.unlabeled_features = unlabeled_features

        # Unlabeled DataSet: holds every example
        self.unlabeled_ds = DataSet(features, labels)

        # Labeled DataSet: a class-balanced subset of unlabeled_features
        # examples, drawn after shuffling
        self.num_examples = self.unlabeled_ds.num_examples
        indices = numpy.arange(self.num_examples)
        shuffled_indices = numpy.random.permutation(indices)
        features = features[shuffled_indices]
        labels = labels[shuffled_indices]
        # Recover integer class indices from the one-hot label rows
        y = numpy.array([numpy.arange(10)[l == 1][0] for l in labels])
        n_classes = y.max() + 1
        # Integer division so the result can be used as a slice bound
        n_from_each_class = unlabeled_features // n_classes
        i_labeled = []
        for c in range(n_classes):
            i = indices[y == c][:n_from_each_class]
            i_labeled += list(i)
        l_images = features[i_labeled]
        l_labels = labels[i_labeled]
        self.labeled_ds = DataSet(l_images, l_labels)

    def next_batch(self, batch_size):
        unlabeled_images, _ = self.unlabeled_ds.next_batch(batch_size)
        # The labeled pool holds only unlabeled_features examples, so cap
        # the labeled half of the batch at that size
        if batch_size > self.unlabeled_features:
            labeled_images, labels = self.labeled_ds.next_batch(self.unlabeled_features)
        else:
            labeled_images, labels = self.labeled_ds.next_batch(batch_size)
        # Stack the labeled images on top of the unlabeled ones; the
        # returned labels cover only the labeled half
        images = numpy.vstack([labeled_images, unlabeled_images])
        return images, labels
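
The one-hot-to-index comprehension above has a simpler vectorized
equivalent, numpy.argmax. A minimal standalone check, with made-up
labels:

import numpy

# Hypothetical one-hot labels for three examples of classes 2, 0 and 7
labels = numpy.zeros((3, 10))
labels[0, 2] = labels[1, 0] = labels[2, 7] = 1

y_loop = numpy.array([numpy.arange(10)[l == 1][0] for l in labels])
y_vec = labels.argmax(axis=1)   # vectorized equivalent
assert (y_loop == y_vec).all()  # both give [2, 0, 7]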
Code Example #2
File: two_spirals.py  Project: afcarl/tensordynamic
def get_two_spirals_data_set_collection():
    train_features, train_labels = two_spirals(2000)
    test_features, test_labels = two_spirals(1000)
    train = DataSet(train_features, train_labels)
    test = DataSet(test_features, test_labels)

    return DataSetCollection("two spirals", train, test, normalize=False)
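
two_spirals itself is not shown on this page. For context, a common way
to generate this classic benchmark looks like the sketch below; it is a
hypothetical stand-in, not this project's implementation:

import numpy as np

def two_spirals_sketch(n_points, noise=0.5, seed=0):
    # Two interleaved spirals, one per class
    rng = np.random.RandomState(seed)
    half = n_points // 2
    theta = np.sqrt(rng.rand(half, 1)) * 4 * np.pi
    spiral = np.hstack([-np.cos(theta) * theta, np.sin(theta) * theta])
    features = np.vstack([spiral, -spiral]) + rng.randn(2 * half, 2) * noise
    labels = np.vstack([np.zeros((half, 1)), np.ones((half, 1))])
    return features, labels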
Code Example #3
def get_cifar_100_data_set_collection(root_path=CIFAR_DATA_DIR,
                                      one_hot=True,
                                      use_fine_labels=True,
                                      validation_size=0,
                                      validation_ratio=None):
    """Get the cifar 100 data set requires files to be downloaded and extracted into cifar-100-python
    directory within root path

    Args:
        root_path (str):
        one_hot (bool): If True converts sparse labels to one hot encoding
        use_fine_labels (bool): If true use full 100 labels, if False use 10 categories

    Returns:
        DataSetCollection
    """
    root_path = root_path + "/cifar-100-python"

    features_train, labels_train = _load_cifar_100_set(root_path + "/train",
                                                       use_fine_labels)
    features_test, labels_test = _load_cifar_100_set(root_path + "/test",
                                                     use_fine_labels)

    if one_hot:
        # CIFAR-100 has 100 fine labels grouped into 20 coarse superclasses
        num_classes = 100 if use_fine_labels else 20
        labels_train = dense_to_one_hot(labels_train, num_classes)
        labels_test = dense_to_one_hot(labels_test, num_classes)

    if not validation_size and validation_ratio:
        validation_size = int(
            (len(labels_train) + len(labels_test)) * validation_ratio)

    if validation_size:
        features_validation = features_train[:validation_size]
        labels_validation = labels_train[:validation_size]

        features_train = features_train[validation_size:]
        labels_train = labels_train[validation_size:]
        validation = DataSet(features_validation,
                             labels_validation,
                             to_binary=True)
    else:
        validation = None

    train = DataSet(features_train, labels_train, to_binary=True)

    test = DataSet(features_test, labels_test, to_binary=True)

    collection = DataSetCollection('CIFAR-100' +
                                   ('-fine' if use_fine_labels else '-coarse'),
                                   train,
                                   test,
                                   validation=validation,
                                   normalize=True)

    return collection
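
dense_to_one_hot is not shown on this page; the sketch below is what a
function of this name conventionally does (an assumed behavior, not
necessarily this project's exact implementation):

import numpy as np

def dense_to_one_hot_sketch(labels_dense, num_classes=10):
    # Map integer class labels to one-hot rows,
    # e.g. 3 -> [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
    labels_dense = np.asarray(labels_dense).ravel()
    one_hot = np.zeros((labels_dense.size, num_classes))
    one_hot[np.arange(labels_dense.size), labels_dense] = 1
    return one_hot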
Code Example #4
File: test_data_set.py  Project: afcarl/tensordynamic
    def test_one_batch_iteration_exact_partial_batch(self):
        batch_size = 10
        data_set = DataSet(np.random.normal(size=(25, 10)),
                           np.random.normal(size=(25, 1)))

        # 25 examples with batch_size 10: the trailing partial batch of
        # 5 is dropped, leaving exactly two full batches
        results = list(data_set.one_iteration_in_batches(batch_size))

        self.assertEqual(len(results), 2)
        self.assertEqual(len(results[0][0]), batch_size)
        self.assertEqual(len(results[0][1]), batch_size)
        self.assertEqual(len(results[-1][0]), batch_size)
        self.assertEqual(len(results[-1][1]), batch_size)
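
one_iteration_in_batches itself is not shown here; the assertions above
imply it yields only full batches. A standalone sketch of that assumed
behavior, not the project's code:

def one_iteration_in_batches_sketch(features, labels, batch_size):
    # Yield full batches only; a trailing partial batch is dropped,
    # which is why 25 rows with batch_size 10 give exactly 2 batches
    for i in range(len(features) // batch_size):
        s = slice(i * batch_size, (i + 1) * batch_size)
        yield features[s], labels[s]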
Code Example #5
def get_cifar_10_data_set_collection(root_path=CIFAR_DATA_DIR,
                                     one_hot=True,
                                     validation_size=0,
                                     validation_ratio=None):
    """Get the cifar 100 data set requires files to be downloaded and extracted into cifar-10-batches-py
    directory within root path

    Args:
        root_path (str):
        one_hot (bool): If True converts sparse labels to one hot encoding

    Returns:
        DataSetCollection
    """
    root_path += "/cifar-10-batches-py"

    features_train, labels_train, features_test, labels_test = _load(root_path)

    if one_hot:
        labels_train = dense_to_one_hot(labels_train)
        labels_test = dense_to_one_hot(labels_test)

    if not validation_size and validation_ratio:
        validation_size = int(
            (len(labels_train) + len(labels_test)) * validation_ratio)

    if validation_size:
        # Take the validation split from the front of the training data
        features_validation = features_train[:validation_size]
        labels_validation = labels_train[:validation_size]

        # Keep the remainder for training
        features_train = features_train[validation_size:]
        labels_train = labels_train[validation_size:]
        validation = DataSet(features_validation,
                             labels_validation,
                             to_binary=True)
    else:
        validation = None

    train = DataSet(features_train, labels_train, to_binary=True)

    test = DataSet(features_test, labels_test, to_binary=True)

    collection = DataSetCollection('CIFAR-10',
                                   train,
                                   test,
                                   validation=validation,
                                   normalize=True)

    return collection
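
A hypothetical call, carving 10% of the combined train+test size (6,000
items for CIFAR-10's 50,000 + 10,000) off the front of the training
split as validation data; the root path is an assumption:

collection = get_cifar_10_data_set_collection(
    root_path="/data/cifar",   # assumes cifar-10-batches-py lives here
    one_hot=True,
    validation_ratio=0.1)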
Code Example #6
File: test_data_set.py  Project: afcarl/tensordynamic
    def test_num_examples(self):
        data_set = DataSet(np.random.normal(size=(100, 10)),
                           np.random.normal(size=(100, 1)))

        self.assertEqual(data_set.num_examples, 100)
Code Example #7
def get_xor_data_set_collection():
    features, labels = xor()
    # The same features and labels back both the train and test sets
    train = DataSet(features, labels)
    test = DataSet(features, labels)

    return DataSetCollection("xor", train, test, normalize=False)
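
xor() is not shown on this page; a minimal stand-in that returns the
four-row XOR truth table (hypothetical, not the project's code):

import numpy as np

def xor_sketch():
    # The four boolean input pairs and their XOR as one-column labels
    features = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    labels = np.array([[0], [1], [1], [0]], dtype=np.float32)
    return features, labels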
Code Example #8
def get_mnist_data_set_collection(train_dir=os.path.dirname(__file__) +
                                  "/MNIST_data",
                                  number_labeled_examples=None,
                                  one_hot=True,
                                  validation_size=0,
                                  validation_ratio=None,
                                  limit_train_size=None,
                                  flatten=True):
    """Load mnist data

    Args:
        train_dir (str): directory to store the downloaded data, or to where it has previously been downloaded
        number_labeled_examples (int): For semi supervised learning, how many labels to use, if None we use supervised
            learning
        one_hot (bool): If True labels will be one hot vectors, not ints
        validation_size (int): Number of items to move to validation set
        limit_train_size (int): If set limit number of training items to this
        flatten (bool): If true data set is flattened to simply be array of image values, not 3d array of
            [width, height, depth]

    Returns:
        DataSetCollection
    """
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = _maybe_download(TRAIN_IMAGES, train_dir)
    train_images = _extract_images(local_file)

    local_file = _maybe_download(TRAIN_LABELS, train_dir)
    train_labels = _extract_labels(local_file, one_hot=one_hot)

    local_file = _maybe_download(TEST_IMAGES, train_dir)
    test_images = _extract_images(local_file)

    local_file = _maybe_download(TEST_LABELS, train_dir)
    test_labels = _extract_labels(local_file, one_hot=one_hot)

    if not validation_size and validation_ratio:
        validation_size = int(
            (len(train_labels) + len(test_labels)) * validation_ratio)

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]

    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    if limit_train_size:
        train_images = train_images[:limit_train_size]
        train_labels = train_labels[:limit_train_size]

    if number_labeled_examples is None:
        # Fully supervised: use every training label
        train = DataSet(train_images,
                        train_labels,
                        flatten=flatten,
                        to_binary=True)
    else:
        # Semi-supervised: SemiDataSet keeps labels for only
        # number_labeled_examples items (see Code Example #1)
        train = SemiDataSet(train_images, train_labels,
                            number_labeled_examples)

    test = DataSet(test_images, test_labels, flatten=flatten, to_binary=True)

    if validation_size:
        validation = DataSet(validation_images,
                             validation_labels,
                             flatten=flatten,
                             to_binary=True)
    else:
        validation = None

    return DataSetCollection('MNIST', train, test, validation)
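
A hypothetical call, holding out 5,000 of MNIST's 60,000 training images
as a validation set and keeping the remaining defaults; the attribute
names on the returned collection are assumed from the constructor
arguments:

collection = get_mnist_data_set_collection(validation_size=5000)
# collection.train      -> 55,000 flattened, one-hot labeled images
# collection.test       -> 10,000 test images
# collection.validation -> the 5,000 held-out images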