def assignClasses(self):
    """Assign a class label to every train, dev and test sample.

    The ``lat``/``lon`` columns of ``self.df_train`` are clustered with
    ``kdtree.KDTreeClustering`` (bucket size ``self.bucket_size``); the label
    of a training sample is the id of the cluster it was put in.  Each
    cluster is summarised by the median latitude and the median longitude of
    its members.  Every dev and test sample gets the label of the cluster
    whose median point is nearest under the ``haversine`` metric.

    Attributes written:
        cluster_median: ``OrderedDict`` mapping cluster id to
            ``(median_lat, median_lon)``, in ascending cluster-id order.
        train_classes, dev_classes, test_classes: per-sample cluster ids, or
            float32 one-hot matrices of shape ``(n_samples, num_labels)``
            when ``self.one_hot_labels`` is truthy.
    """
    clusterer = kdtree.KDTreeClustering(bucket_size=self.bucket_size)
    train_locs = self.df_train[['lat', 'lon']].values
    clusterer.fit(train_locs)
    clusters = clusterer.get_clusters()

    # Group the training coordinates by the cluster they were assigned to.
    cluster_points = defaultdict(list)
    for loc, cluster in zip(train_locs, clusters):
        cluster_points[cluster].append(loc)
    print('# the number of clusterer labels is: %d' % len(cluster_points))

    # One representative point per cluster: the coordinate-wise median.
    self.cluster_median = OrderedDict()
    for cluster in sorted(cluster_points):
        lats, lons = zip(*cluster_points[cluster])
        self.cluster_median[cluster] = (np.median(lats), np.median(lons))

    dev_locs = self.df_dev[['lat', 'lon']].values
    test_locs = self.df_test[['lat', 'lon']].values

    nnbr = NearestNeighbors(n_neighbors=1, algorithm='brute', leaf_size=1,
                            metric=haversine, n_jobs=4)
    nnbr.fit(np.array(list(self.cluster_median.values())))

    # kneighbors() returns row positions in the fitted median matrix, not
    # cluster ids.  Translate positions back to ids so dev/test labels live
    # in the same label space as the training labels even when the ids are
    # not exactly 0..k-1 (for contiguous ids this lookup is the identity).
    cluster_ids = np.array(list(self.cluster_median.keys()))
    dev_rows = nnbr.kneighbors(dev_locs, n_neighbors=1, return_distance=False)[:, 0]
    test_rows = nnbr.kneighbors(test_locs, n_neighbors=1, return_distance=False)[:, 0]
    self.dev_classes = cluster_ids[dev_rows]
    self.test_classes = cluster_ids[test_rows]
    self.train_classes = clusters

    if self.one_hot_labels:
        # The label-space width is taken from the training labels.
        num_labels = np.max(self.train_classes) + 1

        def one_hot(labels):
            # Row i gets a single 1 in column labels[i].
            encoded = np.zeros((len(labels), num_labels), dtype=np.float32)
            encoded[np.arange(len(labels)), labels] = 1
            return encoded

        self.train_classes = one_hot(self.train_classes)
        self.dev_classes = one_hot(self.dev_classes)
        self.test_classes = one_hot(self.test_classes)
def draw_kd_clusters(self, filename, figsize=(4, 3)):
    """Plot the bounding box of every k-d tree cluster on a map and save it.

    The ``lat``/``lon`` columns of ``self.df_train`` are clustered with
    ``kdtree.KDTreeClustering`` (bucket size ``self.bucket_size``).  For each
    cluster the axis-aligned rectangle spanning its points is drawn over a
    Basemap with coastlines, country borders and a land/sea mask.

    Args:
        filename: path handed to ``plt.savefig``.
        figsize: figure size handed to ``plt.figure``.
    """
    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.patches as mpatches
    import matplotlib.pyplot as plt
    from mpl_toolkits.basemap import Basemap

    fig = plt.figure(figsize=figsize)

    # Fixed map extent (presumably the contiguous United States -- confirm).
    lllat = 24.396308
    lllon = -124.848974
    urlat = 49.384358
    urlon = -66.885444
    m = Basemap(llcrnrlat=lllat, urcrnrlat=urlat,
                llcrnrlon=lllon, urcrnrlon=urlon,
                resolution='c', projection='cyl')
    m.drawmapboundary(fill_color='white')
    m.drawcoastlines(linewidth=0.2)
    m.drawcountries(linewidth=0.2)

    ax = plt.gca()
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)

    clusterer = kdtree.KDTreeClustering(bucket_size=self.bucket_size)
    train_locs = self.df_train[['lat', 'lon']].values
    # Project to map coordinates, then restore the (lat, lon) column order.
    mlon, mlat = m(*(train_locs[:, 1], train_locs[:, 0]))
    train_locs = np.transpose(np.vstack((mlat, mlon)))
    clusterer.fit(train_locs)
    clusters = clusterer.get_clusters()

    cluster_points = dd(list)
    for i, cluster in enumerate(clusters):
        cluster_points[cluster].append(train_locs[i])

    # One bounding box per distinct cluster.  Iterating the per-sample label
    # sequence instead would rebuild the same rectangle once per sample.
    corners = []
    for cluster in cluster_points:
        points = np.vstack(cluster_points[cluster])
        min_lat, min_lon = points.min(axis=0)
        max_lat, max_lon = points.max(axis=0)
        # NOTE(review): the points were already projected above; this second
        # projection is presumably a no-op for projection='cyl' -- confirm
        # before switching to another projection.
        min_lon, min_lat = m(min_lon, min_lat)
        max_lon, max_lat = m(max_lon, max_lat)
        corners.append([min_lat, min_lon, max_lat, max_lon])

    patches = []
    for min_lat, min_lon, max_lat, max_lon in corners:
        rect = mpatches.Rectangle((min_lon, min_lat),
                                  max_lon - min_lon, max_lat - min_lat,
                                  facecolor=None, fill=False, linewidth=0.7)
        patches.append(rect)
    # NOTE(review): the collection is created without match_original=True,
    # so the per-rectangle fill/linewidth settings may not be what is
    # rendered -- confirm against the matplotlib PatchCollection docs.
    ax.add_collection(PatchCollection(patches))

    ax.set_xlim([-125, -60])
    ax.set_ylim([25, 50])
    plt.setp(ax.get_yticklabels(), visible=False)
    plt.setp(ax.get_xticklabels(), visible=False)
    ax.yaxis.set_tick_params(size=0)
    ax.xaxis.set_tick_params(size=0)

    m.drawlsmask(land_color='gray', ocean_color="#b0c4de", lakes=True)
    plt.tight_layout()
    # NOTE(review): the figure is left open after saving, as before; repeated
    # calls keep accumulating pyplot figures.
    plt.savefig(filename)
    print("the plot saved in " + filename)
def assignClasses(self):
    """Assign a class label to every train, dev and test sample.

    The ``lat``/``lon`` columns of ``self.df_train`` are clustered with
    ``kdtree.KDTreeClustering`` (bucket size ``self.bucket_size``); the label
    of a training sample is the id of the cluster it was put in.  Each
    cluster is summarised by the median latitude and the median longitude of
    its members.  Every dev and test sample gets the label of the cluster
    whose median point is nearest under the ``haversine`` metric.

    Attributes written:
        cluster_median: ``OrderedDict`` mapping cluster id to
            ``(median_lat, median_lon)``, in ascending cluster-id order.
        train_classes, dev_classes, test_classes: per-sample cluster ids, or
            float32 one-hot matrices of shape ``(n_samples, num_labels)``
            when ``self.one_hot_labels`` is truthy.
    """
    # k-d tree clusterer (earlier note: bucket_size was 50 in one run).
    clusterer = kdtree.KDTreeClustering(bucket_size=self.bucket_size)
    # Earlier debug note: train_locs had shape (5685, 2) in one run.
    train_locs = self.df_train[['lat', 'lon']].values
    clusterer.fit(train_locs)
    # Earlier debug note: one label per training sample, shape (5685,).
    clusters = clusterer.get_clusters()

    # Coordinates of the training users, grouped by assigned cluster.
    cluster_points = defaultdict(list)
    for loc, cluster in zip(train_locs, clusters):
        cluster_points[cluster].append(loc)
    print('# the number of clusterer labels is: %d' % len(cluster_points))

    # One representative point per cluster: the coordinate-wise median
    # (earlier debug note: 129 clusters in one run).
    self.cluster_median = OrderedDict()
    for cluster in sorted(cluster_points):
        lats, lons = zip(*cluster_points[cluster])
        self.cluster_median[cluster] = (np.median(lats), np.median(lons))

    # Earlier debug note: 1895 dev coordinates in one run.
    dev_locs = self.df_dev[['lat', 'lon']].values
    test_locs = self.df_test[['lat', 'lon']].values

    # n_jobs is intentionally not passed here (an earlier variant used 4).
    nnbr = NearestNeighbors(n_neighbors=1, algorithm='brute', leaf_size=1,
                            metric=haversine)
    nnbr.fit(np.array(list(self.cluster_median.values())))

    # kneighbors() returns row positions in the fitted median matrix, not
    # cluster ids.  Translate positions back to ids so dev/test labels live
    # in the same label space as the training labels even when the ids are
    # not exactly 0..k-1 (for contiguous ids this lookup is the identity).
    # Earlier debug note: dev/test labels had shape (1895,), train (5685,).
    cluster_ids = np.array(list(self.cluster_median.keys()))
    dev_rows = nnbr.kneighbors(dev_locs, n_neighbors=1, return_distance=False)[:, 0]
    test_rows = nnbr.kneighbors(test_locs, n_neighbors=1, return_distance=False)[:, 0]
    self.dev_classes = cluster_ids[dev_rows]
    self.test_classes = cluster_ids[test_rows]
    self.train_classes = clusters

    if self.one_hot_labels:
        # The label-space width is taken from the training labels.
        num_labels = np.max(self.train_classes) + 1

        def one_hot(labels):
            # Row i gets a single 1 in column labels[i].
            encoded = np.zeros((len(labels), num_labels), dtype=np.float32)
            encoded[np.arange(len(labels)), labels] = 1
            return encoded

        self.train_classes = one_hot(self.train_classes)
        self.dev_classes = one_hot(self.dev_classes)
        self.test_classes = one_hot(self.test_classes)