def hclust(pts, distance, clust_method="single", use_great_circle=True):
    """
    Hierarchical clustering method. Exposes R's hclust function in Python.

    Algorithm:
        - Calculate the distance between each pair of input pts with sp's
          spDists (use_great_circle is passed through as its `longlat`
          argument: True -> great circle, False -> Euclidean)
        - Cluster the points via R's hclust with those distances and
          clust_method
        - Cut the resulting tree via cutree at height `distance`
        - Group the indexes of the input points by cluster number

    Arguments:
        - pts: sequence of coordinate pairs; it is converted with
          numpy.array() before being handed to R.
          (presumably (longitude, latitude) when use_great_circle is True,
          as sp expects -- confirm against the callers)
        - distance: cut height. Should be in kilometers IF
          use_great_circle is True, otherwise in the same metric as the
          input pts (see the R doc for sp's spDists)
        - clust_method: agglomeration method name forwarded to R's hclust
        - use_great_circle: selects great circle vs. Euclidean distances

    Returns a list of clusters ordered by R cluster number. Each cluster is
    a list of 0-based indexes into `pts` (NOT the points themselves).

    Side effect: activates rpy2's numpy <-> R auto-conversion globally.
    """
    rpy2.robjects.numpy2ri.activate()
    pt_array = numpy.array(pts)

    r.library('sp')
    sp_dists = r.spDists(pt_array, longlat=use_great_circle)
    dists = r('as.dist')(sp_dists)
    tree = r.hclust(dists, method=clust_method)

    # cutree yields one cluster number per input point, in input order, e.g.
    #   [1, 3, 3, 1, 2, 1]
    # means the 0th, 3rd and last points belong to cluster number 1.
    # Coerce to plain ints so they are safe to use as list indexes whatever
    # vector type the R bridge hands back.
    labels = [int(label) for label in r.cutree(tree, h=distance)]

    # Invert that mapping: one bucket per cluster, holding the indexes of the
    # points assigned to it. R numbers clusters from 1, Python indexes from 0,
    # hence the "- 1".
    list_of_pts = [[] for _ in range(max(labels))]
    for pt_index, label in enumerate(labels):
        list_of_pts[label - 1].append(pt_index)
    return list_of_pts
# Demo script: cluster 100 random 2-D points with R's hclust (via rpy2) and
# print the last cluster together with the distances from its first point.

# activate r to numpy array auto-conversion
rpy2.robjects.numpy2ri.activate()

# load sp library
r.library('sp')

# 100 random points with both coordinates drawn from [0, 100)
to_x = numpy.random.rand(100) * 100
to_y = numpy.random.rand(100) * 100
# list() makes the pairs a concrete sequence, so numpy.array() builds a
# (100, 2) array whether or not zip() returns a list.
xy = list(zip(to_x, to_y))
xy_array = numpy.array(xy)

# pairwise distances -> average-linkage tree -> cut at height 100.0
dists = r.spDists(xy_array)
tree = r.hclust(r('as.dist')(dists), "average")
clusts = r.cutree(tree, h=100.0)
np_clusts = numpy.array(clusts)

# np_clusts holds one 1-based cluster number per point (R counts from 1),
# so bucket each point under "number - 1".
clusters = [[] for _ in range(max(np_clusts))]
for label, point in zip(np_clusts, xy_array):
    clusters[label - 1].append(point)

# NOTE: despite its name, cluster1 is the LAST cluster in the list.
cluster1 = clusters[-1]
cluster1_cols = numpy.array(cluster1)
print(cluster1_cols)
print(cluster1[0])

# distances from the first point of that cluster to every point in it
dists = r.spDistsN1(cluster1_cols, cluster1[0])
print(dists)
rownames=['true'], colnames=['predicted'])

# <headingcell level=4>

# Bridging the gap with Rpy2

# <codecell>

# Run the hierarchical clustering in R through rpy2 and compare the result
# with the true iris types.
from rpy2.robjects import r
from rpy2.robjects.numpy2ri import numpy2ri as np2r
# Hand the three feature columns to R as an R object.
Xr = np2r(iris[['PW', 'PL', 'SW']].values)
# R pipeline: distance object -> hclust tree -> cut into k=3 groups.
d = r.dist(Xr)
# NOTE(review): recent R versions document "ward" as renamed to "ward.D"
# (with "ward.D2" added) -- confirm which criterion is intended here.
tree = r.hclust(d, method='ward')
yhat_hclust = r.cutree(tree, k=3)
# Contingency table: true type (rows) vs. cluster assigned by R (columns).
print pd.crosstab(iris['Type'], yhat_hclust, rownames=['true'], colnames=['predicted'])

# <headingcell level=4>

# Using non-base packages in Rpy2

# <codecell>

# importr is what loads an installed R package as a Python object; `r` is
# rebound to the same rpy2 entry point under the `robjects` alias.
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
r = robjects.r
# Relabel the predictions, then tabulate them against the true iris types.
yhat_new = new_label[yhat]
print(pd.crosstab(
    iris['Type'],
    yhat_new,
    rownames=['true'],
    colnames=['predicted']
))

# <headingcell level=4>

# Bridging the gap with Rpy2

# <codecell>

# Same comparison, but with the clustering done by R's hclust via rpy2.
from rpy2.robjects.numpy2ri import numpy2ri as np2r
from rpy2.robjects import r

# Feature columns handed over to R.
Xr = np2r(iris[['PW', 'PL', 'SW']].values)

# distance object -> tree -> three groups
d = r.dist(Xr)
tree = r.hclust(d, method='ward')
yhat_hclust = r.cutree(tree, k=3)

print(pd.crosstab(
    iris['Type'],
    yhat_hclust,
    rownames=['true'],
    colnames=['predicted']
))

# <headingcell level=4>

# Using non-base packages in Rpy2

# <codecell>

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

r = robjects.r

# Load the e1071 R package and pass the Type column through the converter.
e1071 = importr('e1071')
Yr = np2r(iris['Type'])