def select_Z(dataset, OPTION_NZ):
    """Return k-means centroids of the training inputs as ``Z``.

    The global NumPy random generator is re-seeded with a fixed constant
    before clustering, so every call starts from the same random state.

    Args:
        dataset: object exposing the training inputs as ``dataset.xtrain``.
        OPTION_NZ: forwarded to SciPy's ``kmeans`` as its second argument
            (the number of centroids to look for, or an initial guess).

    Returns:
        The codebook (centroid array) produced by ``kmeans``; the distortion
        it also returns is discarded.
    """
    from scipy.cluster import vq

    np.random.seed(149221)
    codebook, _distortion = vq.kmeans(dataset.xtrain, OPTION_NZ)
    return codebook
Exemple #2
0
def select_Z(dataset, OPTION_NZ, seed=149221):
    """Compute ``Z`` as the k-means centroids of ``dataset.xtrain``.

    Args:
        dataset: object exposing the training inputs as ``dataset.xtrain``.
        OPTION_NZ: forwarded to SciPy's ``kmeans`` as its second argument
            (the number of centroids to look for, or an initial guess).
        seed: value handed to ``np.random.seed`` before clustering. The
            default, 149221, is the constant this function has always used,
            so existing two-argument calls behave exactly as before.

    Returns:
        The codebook (centroid array) returned by ``kmeans``; the distortion
        value is discarded.

    Note:
        Seeding is done on NumPy's *global* random state, so it also affects
        any later ``np.random`` draws made by the caller.
    """
    from scipy.cluster.vq import kmeans as scipy_kmeans

    # kmeans is called without its own seed, so reproducibility relies on the
    # global NumPy state being seeded immediately beforehand.
    np.random.seed(seed)
    Z, _ = scipy_kmeans(dataset.xtrain, OPTION_NZ)

    return Z
Exemple #3
0
def load_data(seed, ntrain, datasetName, num_inducing, mat_path='benchmarks.mat'):
    """Load one dataset from a MAT-file, split and normalise it, and run k-means.

    Args:
        seed: value passed to ``np.random.seed`` before the rows are shuffled.
        ntrain: number of shuffled rows placed in the training set; all
            remaining rows form the test set.
        datasetName: key of the dataset inside the MAT-file.
        num_inducing: forwarded to ``scipy_kmeans`` as its second argument
            (number of centroids to compute on the normalised training inputs).
        mat_path: path of the MAT-file to read. Defaults to
            ``'benchmarks.mat'``, the file name this function has always
            used, so existing four-argument calls are unaffected.

    Returns:
        dict with keys ``Xtrain``, ``Ytrain``, ``Xtest``, ``Ytest`` (the
        normalised inputs and 0/1 labels of each split) and ``Z`` (the
        k-means centroids of ``Xtrain``).
    """
    # The dataset entry is indexed as a 1x1 struct; its first two fields are
    # taken as inputs and labels respectively.
    d = io.loadmat(mat_path)[datasetName][0, 0]
    x, y = d[0], d[1]
    y = np.where(y == 1, 1, 0)  # data is stored as +-1, we use 1, 0

    # split into train, test sets
    np.random.seed(seed)
    index = np.random.permutation(x.shape[0])
    itrain, itest = index[:ntrain], index[ntrain:]
    xtrain, xtest = x[itrain], x[itest]
    ytrain, ytest = y[itrain], y[itest]

    # normalize using training data mean, std
    xmean, xstd = xtrain.mean(0), xtrain.std(0)
    # (near-)constant columns keep a unit scale to avoid dividing by ~0
    xstd = np.where(xstd > 1e-6, xstd, 1.)
    xtrain, xtest = (xtrain - xmean) / xstd, (xtest - xmean) / xstd
    Z, _ = scipy_kmeans(xtrain, num_inducing)
    return dict(Xtrain=xtrain, Ytrain=ytrain, Xtest=xtest, Ytest=ytest, Z=Z)
Exemple #4
0
def load_data(seed, ntrain, datasetName, num_inducing):
    """Read a dataset from ``benchmarks.mat`` and prepare train/test splits.

    The rows are shuffled with a seeded permutation, the first ``ntrain``
    become the training set and the rest the test set. Inputs are
    standardised with the training mean and standard deviation, and k-means
    is run on the standardised training inputs.

    Args:
        seed: value passed to ``np.random.seed`` before shuffling.
        ntrain: number of shuffled rows used for training.
        datasetName: key of the dataset inside ``benchmarks.mat``.
        num_inducing: forwarded to ``scipy_kmeans`` as its second argument.

    Returns:
        dict with keys ``Xtrain``, ``Ytrain``, ``Xtest``, ``Ytest`` and ``Z``.
    """
    record = io.loadmat('benchmarks.mat')[datasetName][0, 0]
    inputs = record[0]
    # labels are stored as +1/-1 on disk; map them to 1/0
    labels = np.where(record[1] == 1, 1, 0)

    # seeded shuffle, then cut into train and test index sets
    np.random.seed(seed)
    order = np.random.permutation(inputs.shape[0])
    train_idx = order[:ntrain]
    test_idx = order[ntrain:]
    train_x = inputs[train_idx]
    test_x = inputs[test_idx]

    # standardise both splits with statistics of the training split only
    center = train_x.mean(0)
    scale = train_x.std(0)
    scale = np.where(scale > 1e-6, scale, 1.)
    train_x = (train_x - center) / scale
    test_x = (test_x - center) / scale

    centroids, _ = scipy_kmeans(train_x, num_inducing)
    return {
        'Xtrain': train_x,
        'Ytrain': labels[train_idx],
        'Xtest': test_x,
        'Ytest': labels[test_idx],
        'Z': centroids,
    }
# Close and drop the handle `f`, which was opened earlier in the script
# (outside this excerpt).
f.close()
del f

d2 = datetime.now()
# Microseconds are zero-padded to six digits so the digits after the dot form
# a true decimal fraction (5000 us must print as ".005000", not ".5000").
print("Loading time was: %d.%06d" % ((d2 - d1).seconds, (d2 - d1).microseconds))
# -------------------------------
# Number of clusters, taken from the second command-line argument.
K = int(sys.argv[2])

print('Starting clusterig method...')

d1 = datetime.now()

# SciPy is used only when more than two arguments were given and '--scipy' is
# among them; otherwise the OpenCV implementation runs.
if len(sys.argv) > 3 and '--scipy' in sys.argv:
    print("Scipy kmeans")
    # NOTE(review): `minit` is a keyword of scipy.cluster.vq.kmeans2, so
    # `scipy_kmeans` is presumably an alias for kmeans2 -- confirm at the import.
    centroids, labels = scipy_kmeans(m, K, minit='points')
else:
    print("Opencv kmeans")
    samples = cv.fromarray(m)
    # One CV_32SC1 label cell per row of `samples`.
    labels = cv.CreateMat(samples.height, 1, cv.CV_32SC1)
    # Alternative criterion, kept for reference:
    #    crit = (cv.CV_TERMCRIT_EPS + cv.CV_TERMCRIT_ITER, 10, 1.0)
    # Iteration-count criterion only; presumably (type, max_iter, epsilon).
    crit = (cv.CV_TERMCRIT_ITER, 10, 0)
    # NOTE(review): this branch fills `labels` only; `centroids` is not
    # assigned here, unlike in the SciPy branch.
    cv.KMeans2(samples, K, labels, crit)


d2 = datetime.now()
# Same zero-padding fix as for the loading time above.
print("Elapsed time for %d clusters: %d.%06d" % (K, (d2 - d1).seconds, (d2 - d1).microseconds))

print('Updating HDF file with results...')

d1 = datetime.now()
# Report on `clusters` (built earlier in the script, outside this excerpt):
# one line per cluster with its index and number of data objects.
for i, cluster in enumerate(clusters):
    print "Cluster", i, len(cluster.data_objects)

print "Silhouette score", cluster_evaluation.silhouette_score(clusters)

time_2 = time.time()

# `time_1` for this first measurement was set before this excerpt begins.
print time_2 - time_1, "seconds"

# Bare Python 2 print statement: emits an empty line as a separator.
print

# Restart the timer for the SciPy run.
time_1 = time.time()

print "Scipy kmeans"

# NOTE(review): `.ix` and `.as_matrix()` are legacy pandas accessors (both
# removed in pandas 1.0); `data_objects` looks like a DataFrame -- confirm.
# `.ix[:, 0:7]` slices by label or by position depending on the column index,
# so check which columns are actually selected before modernising this line.
data_objects = data_objects.ix[:, 0:7].as_matrix()

# `k` (number of clusters) is defined outside this excerpt.
codebook, distortion = scipy_kmeans(data_objects, k)

# Assign every row to a codebook entry; note that `distortion` from the
# previous call is overwritten here.
code, distortion = vq(data_objects, codebook)

# Count members per cluster. `len(filter(...))` relies on Python 2, where
# filter() returns a list, and scans all of `code` once per cluster.
for i in range(k):
    print "Cluster", i, len(filter(lambda x:x==i, code))

# NOTE(review): this is a different callable from
# cluster_evaluation.silhouette_score used above (it takes data and labels);
# presumably scikit-learn's -- confirm at the import.
print "Silhouette score", silhouette_score(data_objects, code)

time_2 = time.time()

print time_2 - time_1, "seconds"