def hclust(pts, distance, clust_method="single", use_great_circle=True):
    """Hierarchical clustering; exposes R's hclust function in Python.

    Algorithm:
    - Calculate the distance between each pair of input pts with sp's
      spDists (use_great_circle: True -> great-circle distance,
      False -> Euclidean distance).
    - Cluster the points via R hclust with those distances and clust_method.
    - Cut the resulting tree via R cutree at height ``distance``.
    - Group the indexes of the input points by the cluster they fell into.

    Args:
        pts: sequence of point coordinates; it is converted with
            numpy.array before being handed to R.
        distance: height at which the tree is cut.  Should be in kilometers
            if use_great_circle is True, otherwise in the same metric as
            the input pts (see the R documentation for sp's spDists).
        clust_method: agglomeration method name passed to R hclust.
        use_great_circle: forwarded to spDists as ``longlat``.

    Returns:
        A list of clusters.  Each cluster is a list of 0-based indexes into
        ``pts`` (indexes, not the points themselves).  Empty input gives
        ``[]`` and a single point gives ``[[0]]``.
    """

    rpy2.robjects.numpy2ri.activate()

    pt_array = numpy.array(pts)
    r.library('sp')

    # R's hclust needs at least two objects to cluster, so answer the
    # trivial cases here: no points -> no clusters, one point -> one cluster.
    if pt_array.ndim >= 1 and pt_array.shape[0] < 2:
        return [[idx] for idx in range(pt_array.shape[0])]

    sp_dists = r.spDists(pt_array, longlat=use_great_circle)
    dists = r('as.dist')(sp_dists)
    tree = r.hclust(dists, method=clust_method)
    clusters = r.cutree(tree, h=distance)

    # cutree yields one cluster number per input point, in input order.
    # For example:
    #   clusters = [1, 3, 3, 1, 2, 1]
    # means the 0th, 3rd and last points belong to cluster number 1.
    #
    # We want the inverse mapping: one list per cluster holding the indexes
    # of its points.  R numbers clusters from 1 while Python lists start at
    # 0, hence the "- 1" when choosing the bucket.
    #
    # Cast to plain ints so the ids are usable as list indexes whatever
    # container type the R -> Python conversion produced.
    cluster_ids = [int(cluster_id) for cluster_id in clusters]

    list_of_pts = [[] for _ in range(max(cluster_ids))]
    for pt_index, cluster_id in enumerate(cluster_ids):
        list_of_pts[cluster_id - 1].append(pt_index)

    return list_of_pts
# Demo: cluster 100 random 2-D points with R's hclust, then compute the
# distances from the first member of the last cluster to every member of
# that cluster.

# activate r to numpy array auto-conversion
rpy2.robjects.numpy2ri.activate()

# load sp library (used below for spDists / spDistsN1)
r.library('sp')

# 100 random x and y coordinates, each scaled into [0, 100)
to_x = numpy.random.rand(100) * 100
to_y = numpy.random.rand(100) * 100

# Materialize the pairs explicitly: where zip() returns a lazy iterator
# (Python 3), numpy.array() would wrap the iterator object instead of
# building the 100 x 2 coordinate array.
xy = list(zip(to_x, to_y))

xy_array = numpy.array(xy)

# Pairwise distances -> average-linkage tree -> cut at height 100.0
dists = r.spDists(xy_array)
tree = r.hclust(r('as.dist')(dists), "average")
clusts = r.cutree(tree, h=100.0)

np_clusts = numpy.array(clusts)

# cutree numbers clusters from 1 (R convention), so cluster k goes into
# bucket k - 1.  Each bucket collects the coordinates of its members.
clusters = [[] for _ in range(max(np_clusts))]
for i, label in enumerate(np_clusts):
    clusters[label - 1].append(xy_array[i])

# Inspect the highest-numbered cluster.
cluster1 = clusters[-1]
cluster1_cols = numpy.array(cluster1)
# Single-argument print(...) behaves the same on Python 2 and also parses
# on Python 3.
print(cluster1_cols)
print(cluster1[0])
# Distances from the cluster's first point to every point in the cluster.
dists = r.spDistsN1(cluster1_cols, cluster1[0])
print(dists)
# Example #3
0
                  rownames=['true'],
                  colnames=['predicted'])

# <headingcell level=4>

# Bridging the gap with Rpy2

# <codecell>

from rpy2.robjects import r
from rpy2.robjects.numpy2ri import numpy2ri as np2r

# Hierarchical clustering of three iris feature columns through R.
# NOTE(review): `iris` and `pd` are defined outside this view; `iris` is
# presumably a pandas DataFrame with 'PW', 'PL', 'SW' and 'Type' columns.

# Hand the selected feature values to R via the numpy2ri converter.
Xr = np2r(iris[['PW', 'PL', 'SW']].values)
# Pairwise distances between rows, computed by R's dist.
d = r.dist(Xr)
# Build the cluster tree with the 'ward' agglomeration method.
tree = r.hclust(d, method='ward')
# Cut the tree into k=3 groups; one group label per row.
yhat_hclust = r.cutree(tree, k=3)

# Cross-tabulate true types against predicted groups.  Single-argument
# print(...) behaves the same on Python 2 and also parses on Python 3.
print(pd.crosstab(iris['Type'],
                  yhat_hclust,
                  rownames=['true'],
                  colnames=['predicted']))

# <headingcell level=4>

# Using non-base packages in Rpy2

# <codecell>

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
# Bind rpy2's R interface object to the short name `r`.
r = robjects.r
# NOTE(review): `new_label` and `yhat` are not defined in the visible part of
# this file; this line looks like it was spliced in from another cell.
# Presumably `new_label` maps predicted cluster ids to relabeled ids - confirm.
yhat_new = new_label[yhat]
# Cross-tabulate the iris 'Type' column against the relabeled predictions.
print pd.crosstab(iris['Type'], yhat_new, rownames=['true'], colnames=['predicted'])

# <headingcell level=4>

# Bridging the gap with Rpy2

# <codecell>

from rpy2.robjects import r
from rpy2.robjects.numpy2ri import numpy2ri as np2r

# Hierarchical clustering of three iris feature columns through R.
# NOTE(review): `iris` and `pd` are defined outside this view; `iris` is
# presumably a pandas DataFrame with 'PW', 'PL', 'SW' and 'Type' columns.

# Hand the selected feature values to R via the numpy2ri converter.
Xr = np2r(iris[['PW', 'PL', 'SW']].values)
# Pairwise distances between rows, computed by R's dist.
d = r.dist(Xr)
# Build the cluster tree with the 'ward' agglomeration method.
tree = r.hclust(d, method='ward')
# Cut the tree into k=3 groups; one group label per row.
yhat_hclust = r.cutree(tree, k=3)

# Cross-tabulate true types against predicted groups.  Single-argument
# print(...) behaves the same on Python 2 and also parses on Python 3.
print(pd.crosstab(iris['Type'], yhat_hclust, rownames=['true'], colnames=['predicted']))

# <headingcell level=4>

# Using non-base packages in Rpy2

# <codecell>

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
# Bind rpy2's R interface object to the short name `r`.
r = robjects.r

# Import the non-base R package 'e1071' through rpy2's importr so it can be
# used from Python under the name `e1071`.
e1071 = importr('e1071')
# Convert the iris 'Type' column with the numpy2ri converter (`np2r`).
# NOTE(review): `iris` is defined outside this view - presumably a pandas
# DataFrame; confirm that np2r accepts a Series here.
Yr = np2r(iris['Type'])