Esempio n. 1
0
    def assignClasses(self):
        """
            Assign a class label to every train, dev and test sample.

            Training samples get the cluster label that a
            ``kdtree.KDTreeClustering`` (fitted on their ``lat``/``lon``
            columns) returns for them. Every cluster is summarised by the
            per-coordinate median of its members, stored in
            ``self.cluster_median`` in ascending label order. Dev and test
            samples receive the label of the cluster whose median is nearest
            to them under the ``haversine`` metric.

            The results are written to ``self.train_classes``,
            ``self.dev_classes`` and ``self.test_classes``. When
            ``self.one_hot_labels`` is truthy these are float32 one-hot
            matrices with ``max(train label) + 1`` columns; otherwise they
            are label vectors.
        """
        def one_hot(labels, num_labels):
            # One row per sample, with a single 1 in the column of its label.
            encoded = np.zeros((len(labels), num_labels), dtype=np.float32)
            encoded[np.arange(len(labels)), labels] = 1
            return encoded

        clusterer = kdtree.KDTreeClustering(bucket_size=self.bucket_size)
        train_locs = self.df_train[['lat', 'lon']].values
        clusterer.fit(train_locs)
        # One label per training row (indexed in lockstep with train_locs).
        clusters = clusterer.get_clusters()

        # Group the training coordinates by cluster label.
        cluster_points = defaultdict(list)
        for i, cluster in enumerate(clusters):
            cluster_points[cluster].append(train_locs[i])
        print('# the number of clusterer labels is: %d' % len(cluster_points))

        # Median (lat, lon) of each cluster, in ascending label order.
        self.cluster_median = OrderedDict()
        for cluster in sorted(cluster_points):
            median_lat, median_lon = np.median(
                np.vstack(cluster_points[cluster]), axis=0)
            self.cluster_median[cluster] = (median_lat, median_lon)

        dev_locs = self.df_dev[['lat', 'lon']].values
        test_locs = self.df_test[['lat', 'lon']].values
        nnbr = NearestNeighbors(n_neighbors=1,
                                algorithm='brute',
                                leaf_size=1,
                                metric=haversine,
                                n_jobs=4)
        nnbr.fit(np.array(list(self.cluster_median.values())))

        # kneighbors() yields row positions in the fitted median array, not
        # cluster labels. Translate positions back to labels so the result
        # stays correct even if the labels are not exactly 0..K-1.
        cluster_labels = np.array(list(self.cluster_median.keys()))
        dev_rows = nnbr.kneighbors(dev_locs,
                                   n_neighbors=1,
                                   return_distance=False)[:, 0]
        test_rows = nnbr.kneighbors(test_locs,
                                    n_neighbors=1,
                                    return_distance=False)[:, 0]
        self.dev_classes = cluster_labels[dev_rows]
        self.test_classes = cluster_labels[test_rows]

        self.train_classes = clusters

        if self.one_hot_labels:
            # The column count is driven by the largest training label.
            num_labels = np.max(self.train_classes) + 1
            self.train_classes = one_hot(self.train_classes, num_labels)
            self.dev_classes = one_hot(self.dev_classes, num_labels)
            self.test_classes = one_hot(self.test_classes, num_labels)
Esempio n. 2
0
    def draw_kd_clusters(self, filename, figsize=(4, 3)):
        """
            Draw the bounding box of every k-d tree cluster on a map.

            The training locations (``lat``/``lon`` columns of
            ``self.df_train``) are clustered with ``kdtree.KDTreeClustering``
            using ``self.bucket_size``. For each cluster the axis-aligned
            bounding box of its members is drawn as an unfilled rectangle on a
            Basemap in 'cyl' projection, and the figure is saved.

            Args:
                filename: target path handed to ``plt.savefig``.
                figsize: figure size forwarded to ``plt.figure``.
        """
        import matplotlib as mpl
        # Select the Agg backend before pyplot is imported.
        mpl.use('Agg')
        import matplotlib.patches as mpatches
        import matplotlib.pyplot as plt
        from mpl_toolkits.basemap import Basemap

        plt.figure(figsize=figsize)

        # Map corners (lower-left / upper-right).
        # NOTE(review): presumably the contiguous United States - confirm.
        lllat = 24.396308
        lllon = -124.848974
        urlat = 49.384358
        urlon = -66.885444
        m = Basemap(llcrnrlat=lllat,
                    urcrnrlat=urlat,
                    llcrnrlon=lllon,
                    urcrnrlon=urlon,
                    resolution='c',
                    projection='cyl')
        m.drawmapboundary(fill_color='white')
        m.drawcoastlines(linewidth=0.2)
        m.drawcountries(linewidth=0.2)

        ax = plt.gca()
        for side in ('top', 'right', 'bottom', 'left'):
            ax.spines[side].set_visible(False)

        clusterer = kdtree.KDTreeClustering(bucket_size=self.bucket_size)
        train_locs = self.df_train[['lat', 'lon']].values
        # Project to map coordinates, then restore the (lat, lon) column order.
        mlon, mlat = m(*(train_locs[:, 1], train_locs[:, 0]))
        train_locs = np.transpose(np.vstack((mlat, mlon)))

        clusterer.fit(train_locs)
        clusters = clusterer.get_clusters()
        cluster_points = dd(list)
        for i, cluster in enumerate(clusters):
            cluster_points[cluster].append(train_locs[i])

        # One rectangle per distinct cluster. Iterating over `clusters` itself
        # would rebuild the same box once for every sample in the cluster.
        patches = []
        for members in cluster_points.values():
            points = np.vstack(members)
            min_lat, min_lon = points.min(axis=0)
            max_lat, max_lon = points.max(axis=0)
            # NOTE(review): the points were already passed through m() above;
            # this second call is presumably a no-op for the 'cyl' projection
            # - confirm before changing the projection.
            min_lon, min_lat = m(min_lon, min_lat)
            max_lon, max_lat = m(max_lon, max_lat)
            rect = mpatches.Rectangle((min_lon, min_lat),
                                      max_lon - min_lon,
                                      max_lat - min_lat,
                                      facecolor=None,
                                      fill=False,
                                      linewidth=0.7)
            patches.append(rect)
        # match_original=True keeps each rectangle's own fill/linewidth
        # settings; without it the collection applies its default styling and
        # the fill=False / linewidth=0.7 above are ignored.
        ax.add_collection(PatchCollection(patches, match_original=True))
        ax.set_xlim([-125, -60])
        ax.set_ylim([25, 50])

        # Hide tick labels and tick marks on both axes.
        plt.setp(ax.get_yticklabels(), visible=False)
        plt.setp(ax.get_xticklabels(), visible=False)
        ax.yaxis.set_tick_params(size=0)
        ax.xaxis.set_tick_params(size=0)
        m.drawlsmask(land_color='gray', ocean_color="#b0c4de", lakes=True)
        plt.tight_layout()
        plt.savefig(filename)
        print("the plot saved in " + filename)
Esempio n. 3
0
    def assignClasses(self):
        """
        Compute class labels for the train, dev and test samples.

        A ``kdtree.KDTreeClustering`` is fitted on the ``lat``/``lon``
        columns of ``self.df_train``; the label it returns for each training
        row becomes that row's class. ``self.cluster_median`` maps every
        cluster label (ascending) to the per-coordinate median of its
        members. Dev and test rows are assigned the label of the cluster
        whose median is nearest under the ``haversine`` metric.

        Writes ``self.train_classes``, ``self.dev_classes`` and
        ``self.test_classes``. If ``self.one_hot_labels`` is truthy they are
        float32 one-hot matrices with ``max(train label) + 1`` columns;
        otherwise they are label vectors.
        """
        def one_hot(labels, width):
            # Row r gets a single 1 in column labels[r].
            matrix = np.zeros((len(labels), width), dtype=np.float32)
            matrix[np.arange(len(labels)), labels] = 1
            return matrix

        # KD-tree clusterer.
        clusterer = kdtree.KDTreeClustering(bucket_size=self.bucket_size)
        # (lat, lon) of every training user; the original author noted a
        # shape of (5685, 2) for one dataset.
        train_locs = self.df_train[['lat', 'lon']].values
        clusterer.fit(train_locs)
        # One cluster label per training row.
        clusters = clusterer.get_clusters()

        # Coordinates of the training users, grouped by cluster label.
        cluster_points = defaultdict(list)
        for row, cluster in enumerate(clusters):
            cluster_points[cluster].append(train_locs[row])
        print('# the number of clusterer labels is: %d' % len(cluster_points))

        # Median (lat, lon) per cluster, keyed in ascending label order.
        self.cluster_median = OrderedDict()
        for cluster in sorted(cluster_points):
            median_lat, median_lon = np.median(
                np.vstack(cluster_points[cluster]), axis=0)
            self.cluster_median[cluster] = (median_lat, median_lon)

        dev_locs = self.df_dev[['lat', 'lon']].values
        test_locs = self.df_test[['lat', 'lon']].values
        nnbr = NearestNeighbors(
            n_neighbors=1,
            algorithm='brute',
            leaf_size=1,
            metric=haversine)
        nnbr.fit(np.array(list(self.cluster_median.values())))

        # kneighbors() reports positions in the fitted median array rather
        # than cluster labels; map them back through the ordered keys so the
        # dev/test classes agree with the training labels even when those
        # labels are not exactly 0..K-1.
        cluster_labels = np.array(list(self.cluster_median.keys()))
        dev_rows = nnbr.kneighbors(
            dev_locs, n_neighbors=1, return_distance=False)[:, 0]
        test_rows = nnbr.kneighbors(
            test_locs, n_neighbors=1, return_distance=False)[:, 0]
        # Class of each dev / test / train sample, one entry per row.
        self.dev_classes = cluster_labels[dev_rows]
        self.test_classes = cluster_labels[test_rows]
        self.train_classes = clusters

        if self.one_hot_labels:
            # Column count follows the largest training label.
            num_labels = np.max(self.train_classes) + 1
            self.train_classes = one_hot(self.train_classes, num_labels)
            self.dev_classes = one_hot(self.dev_classes, num_labels)
            self.test_classes = one_hot(self.test_classes, num_labels)