Example #1
0
def test_compute_embedding(check_asserts=True):
    """Smoke-test SDAEmbedder on the scikit-learn digits dataset.

    Trains a stacked denoising autoencoder down to a 2D embedding and
    checks that its kNN local-match score improves on a random 2D
    projection baseline.

    Parameters
    ----------
    check_asserts : bool, default True
        When False, run the full pipeline without verifying the
        expected scores (useful when reusing this as a benchmark).
        The original code accepted this flag but never used it; the
        assertions are now gated on it.
    """
    # seed both RNGs so the scores below are reproducible
    np.random.seed(0)
    random.seed(0)

    # sample data from the digits 8x8 pixels dataset
    digits = datasets.load_digits()
    data = digits.data
    n_samples, n_features = data.shape
    low_dim = 2

    # baseline score using a random 2D projection
    projection = random_project(data, target_dim=2, rng=np.random)
    score = local_match(data, projection, query_size=50, ratio=0.1, seed=0)
    if check_asserts:
        assert_almost_equal(score, 0.12, 2)

    # compute an embedding of the data
    embedder = SDAEmbedder(
        (n_features, 40, 15, low_dim),
        noise=0.1,
        reconstruction_penalty=0.0,
        embedding_penalty=1,
        sparsity_penalty=0.0,
        learning_rate=0.1,
        seed=0,
    )
    embedder.pre_train(data, epochs=500, batch_size=5)

    code = embedder.encode(data)
    if check_asserts:
        assert_equal(code.shape, (n_samples, low_dim))

    # compare nearest neighbors in the original space vs. the embedding;
    # the embedding should match much better than the random baseline
    score = local_match(data, code, query_size=50, ratio=0.1, seed=0)

    if check_asserts:
        assert_almost_equal(score, 0.33, 1)
Example #2
0
# clear the current matplotlib figure before plotting anything new
pl.clf()

n_features = 30
n_samples = 1000

print "Generating embedded swissroll with n_features=%d and n_samples=%d" % (
    n_features, n_samples)

# sample a swissroll manifold embedded in a higher-dimensional space;
# `manifold` presumably holds the ground-truth low-dim coordinates of
# each sample — TODO confirm against the swissroll module
data, manifold = swissroll.load(
    n_features=n_features,
    n_samples=n_samples,
    n_turns=1.2,
    radius=1.,
    hole=False,
)
# ground-truth score: how well kNN queries on the true manifold
# coordinates match kNN queries on the high-dim data
score_manifold_data = local_match(
    data, manifold, query_size=50, ratio=1, seed=0)
print "kNN score match manifold/data:", score_manifold_data

# build model to extract the manifold and learn a mapping / encoder to be able
# to reproduce this on test data
embedder = SDAEmbedder((n_features, 10, 2),
                       noise=0.1,
                       reconstruction_penalty=1.0,
                       embedding_penalty=0.1,
                       sparsity_penalty=0.0,
                       learning_rate=0.1, seed=0)

# use the randomly initialized encoder to measure the baseline
code = embedder.encode(data)
score_code_data = local_match(data, code, query_size=50, ratio=1, seed=0)
print "kNN score match after pre-training code/data:", score_code_data
Example #3
0
n_samples = 5000
n_manifolds = 100

# NOTE(review): n_features is not defined in this snippet — it is
# expected to come from earlier in the file; verify before running
print "Generating %d embedded swissrolls with n_features=%d, n_samples=%d" % (
    n_manifolds, n_features, n_samples)

# sample several swissroll manifolds embedded in the same high-dim space;
# `colors` presumably labels each sample with its manifold — TODO confirm
data, manifolds, colors = multirolls.load(
    n_features=n_features,
    n_samples=n_samples,
    n_manifolds=n_manifolds,
)

# compute a baseline evaluation of the manifolds (ground truth):
# shift each 2D manifold by 10*i along the first axis so that distinct
# manifolds do not overlap once stacked into a single array
stacked_manifolds = np.vstack([m + [10 * i, 0]
                               for i, m in enumerate(manifolds)])
score = local_match(data, stacked_manifolds, query_size=50, ratio=1, seed=0)
print "kNN score match manifolds/data (ground truth):", score

# compute the score of a random 2D projection as a lower-bound baseline
projection = random_project(data, target_dim=2, rng=np.random)
score = local_match(data, projection, query_size=50, ratio=1, seed=0)
print "kNN score match projection/data (baseline):", score

# reshuffle the data since stochastic gradient descent assumes I.I.D. samples
perm = np.random.permutation(data.shape[0])
data, colors = data[perm], colors[perm]
# build model to extract the manifolds and learn a mapping / encoder to be able
# to reproduce this on test data
embedder = SDAEmbedder((n_features, 30, 10, 2), noise=0.1,
                       embedding_penalty=1.0,
Example #4
0
                       embedding_penalty=0.0,
                       sparsity_penalty=0.0,
                       learning_rate=0.1, seed=0)

print "Training encoder to extract a semantic preserving 2D mapping"
start = time.time()
# slice(None, None) selects all samples; pre-train over the full dataset
embedder.pre_train(data, slice_=slice(None, None), epochs=1000, batch_size=100)
print "done in %ds" % (time.time() - start)

# evaluation of the quality of the embedding by comparing kNN queries from the
# original (high dim) data and the low dim code on the one hand, and from the
# ground truth low dim manifold and the low dim code on the other hand

fig = pl.figure(1)
code = embedder.encode(data)
score_code_data = local_match(data, code, query_size=50, ratio=1, seed=0)
print "kNN score match after pre-training code/data:", score_code_data
# plot and measure how well pairwise distances in the code correlate
# with pairwise distances in the original data
_, _, corr = pairwise_distances(data, code, ax=fig.add_subplot(1, 1, 1),
                                title="pre-training")
print "Pairwise distances correlation:", corr

## fine tuning (deliberately disabled; kept for reference)
#print "Fine tuning encoder to unroll the embedded data..."
#start = time.time()
#embedder.fine_tune(data, epochs=100, batch_size=5)
#print "done in %ds" % (time.time() - start)

#code = embedder.encode(data)
#score_code_data = local_match(data, code, query_size=50, ratio=1, seed=0)
#print "kNN score match after fine-tuning code/data:", score_code_data
#_, _, corr = pairwise_distances(data, code, ax=fig.add_subplot(3, 1, 3),