/
gaussianClusterTest.py
37 lines (29 loc) · 1.95 KB
/
gaussianClusterTest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#! /usr/bin/env python
from numpy import *
from scipy.stats import kstest
from scipy.cluster.vq import *
from numpy.random import randn
from scipy.stats import anderson
# Define the g-means algorithm
# This will test the given dataset (r) for gaussianity along the axis
# that k-means determines is the most important for clustering.
# The return value (AD_stat) will be large if the cluster is non-Gaussian
# and small if the cluster is Gaussian.
def gaussianClusterTest(r):
    """Test whether the points in ``r`` look Gaussian along their split axis.

    The data is split into two clusters with k-means (k=2). Every point is
    then projected onto the line joining the two centroids, and that 1-D
    projection is checked with the Anderson-Darling normality test.

    Parameters
    ----------
    r : 2-D array
        One observation per row and one feature per column (the layout
        ``kmeans2`` expects). Anything that is not 2-D raises ``ValueError``
        when its shape is unpacked.

    Returns
    -------
    AD_stat : float or int
        Anderson-Darling statistic of the projected data; larger values mean
        the projection is less Gaussian. ``-1`` when the test cannot be run
        (at most one point, or both centroids coincide).
    AD_crit : float or int
        Critical value of the statistic at the 5% significance level, or
        ``0`` when the test cannot be run.
    labels : ndarray or int
        Cluster index (0 or 1) assigned to each row by k-means, or ``0``
        when there is at most one point and k-means is not run at all.
    """
    num_points, _num_features = shape(r)

    # A cluster holding at most one point cannot be tested or split further.
    # The -1 statistic is below any critical value, so it never reads as
    # "non-Gaussian".
    if num_points <= 1:
        return -1, 0, 0

    # Run k-means with k=2, seeding the centroids from the data points.
    centroids, labels = kmeans2(r, 2, minit='points')

    # Direction joining the two centroids: the axis k-means split along.
    v = centroids[1, :] - centroids[0, :]
    v_norm = linalg.norm(v)

    # Coincident centroids (e.g. every point identical) give a zero-length
    # direction; dividing by it would fill the projection with NaNs. Report
    # the same "cannot test" sentinel as above, keeping the k-means labels.
    if v_norm == 0:
        return -1, 0, labels

    # Scalar projection of every point onto the unit vector along v.
    x = dot(r, v) / v_norm

    AD_stat, AD_criticals, _AD_percent = anderson(x, 'norm')
    # For the normal distribution the third entry is the 5% level.
    AD_crit = AD_criticals[2]
    return AD_stat, AD_crit, labels