def kmeans(data, **kwargs):
    """
    Perform k-means clustering on unstructured N-dimensional data.

    The data is log-transformed (via tm.getMethod('log')) before clustering.
    When no initial centers are supplied, pc.kcluster is run with npasses
    random starts.  When initial centers are supplied, the first numClusters
    of them are log-transformed and handed to kmeans2 as the starting
    centroids; a warning dialog is shown if fewer than numClusters distinct
    clusters come back.

    @type data: array
    @param data: The data to be clustered
    @type kwargs: dict
    @param kwargs: The following args are accepted:
        - numClusters: The number of clusters to form (returned number of
          clusters may be less than k).
        - npasses: The number of times the k-means clustering algorithm is
          performed, each time with a different (random) initial condition.
          Only used when no initial centers are given.
        - method: describes how the center of a cluster is found:
            - method=='a': arithmetic mean.
            - method=='m': median.
          Only used when no initial centers are given.
        - initialCenters: a set of points that should be used as the
          initial cluster centers
        - smartCenters: accepted for compatibility; not used here.

    @rtype: tuple
    @return: A list where each element indicates the cluster membership of the
        corresponding index in the original data and a message string
    """
    k = int(kwargs.get('numClusters', 1))
    npasses = int(kwargs.get('npasses', 1))
    method = kwargs.get('method', 'a')
    initialCenters = kwargs.get('initialCenters')
    msg = ''

    logData = tm.getMethod('log')(data)

    if initialCenters is None:
        # No user-supplied centers: let pc.kcluster pick random starts.
        clusterIDs, _, nOpt = pc.kcluster(logData, k, npass=npasses,
                                          method=method)
        msg = "Number of rounds optimal solution was found: %i" % nOpt
    else:
        # The centers must live in the same (log) space as the data.
        logCenters = tm.getMethod('log')(np.array(initialCenters[:k]))
        _, clusterIDs = kmeans2(logData, logCenters, minit='matrix')
        if len(np.unique(clusterIDs)) < k:
            wx.MessageBox(
                'Warning: One or more of the returned clusters are empty. '
                'Please choose different initial cluster centers and '
                're-run k-means for better results.',
                'Insufficiently varied cluster centers',
                wx.OK | wx.ICON_WARNING)

    return clusterIDs, msg
def heatmap2d(subplot, figure, dims):
    """
    heatmap2d; 2D Heatmap; Plots a 2D heatmap of the data with event density as the main indicator.
    """
    # NOTE(review): the docstring above looks like a semicolon-delimited
    # "id; display name; description" record, presumably read elsewhere at
    # runtime -- it is intentionally left untouched.  Confirm before editing.
    #
    # subplot: provides opts, mnp, Title, Labels, Data and parent; its
    #          `axes` attribute is (re)assigned here, and default options
    #          are written into subplot.opts when it is empty.
    # figure:  object whose add_subplot() creates the axes.
    # dims:    indices of the two Data columns drawn on the x and y axes.
    #
    # Raises ValueError when opts['type'] is not a known heatmap type.
    opts = subplot.opts
    if not opts:
        opts['type'] = 'Hexbins'
        opts['colorMap'] = 'gist_earth'
        opts['bins'] = (200, 200)
        opts['transform'] = 'log'
        opts['transformAuto'] = True

    subplot.axes = figure.add_subplot(subplot.mnp, title=subplot.Title)
    subplot.axes.set_xlabel(subplot.Labels[dims[0]])
    subplot.axes.set_ylabel(subplot.Labels[dims[1]])

    x = subplot.Data[:, dims[0]]
    y = subplot.Data[:, dims[1]]
    cbLabel = ''

    # apply transform
    if 'transform' not in opts:
        opts['transform'] = 'log'
    if opts['transform'] == 'log':
        x = tm.getMethod('log')(x)
        y = tm.getMethod('log')(y)

    extent = (0, x.max()*1.05, 0, y.max()*1.05)
    cmap = CM.get_cmap(opts['colorMap'])

    plotType = opts['type']
    if plotType == 'Hexbins':
        gAx = subplot.axes.hexbin(x, y, gridsize=opts['bins'][0],
                                  extent=extent, mincnt=1, cmap=cmap)
        cbLabel = 'Events'
    # The following two means of calculating the heat map are not enabled
    # for selection.
    elif plotType == 'Gaussian KDE':
        # NOTE(review): orientation of fast_kde's grid is not visible from
        # here; left as originally written.
        kdeGrid = fast_kde(x, y, gridsize=opts['bins'])
        gAx = subplot.axes.imshow(kdeGrid, extent=extent, cmap=cmap,
                                  aspect='auto', interpolation='bicubic')
    elif plotType == 'Histogram':
        heatmap, xedges, yedges = np.histogram2d(x, y, bins=opts['bins'])
        extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
        # histogram2d bins x along axis 0, while imshow draws axis 0 as
        # rows (vertical) starting from the top.  Transpose and anchor the
        # first row at the bottom so the image matches the axes.
        gAx = subplot.axes.imshow(heatmap.T, extent=extent, cmap=cmap,
                                  aspect='auto', origin='lower')
    else:
        raise ValueError('Unknown heatmap type: %r' % (plotType,))

    cb = subplot.parent.colorbar(gAx)
    if cbLabel == '':
        cbLabel = 'Events' if opts['transform'] != 'log' else 'log Events'
    cb.set_label(cbLabel)
def histogram(subplot, figure, dims):
    """
    histogram; Histogram; Plots a 1D Histogram
    """
    # NOTE(review): the docstring above looks like a semicolon-delimited
    # "id; display name; description" record, presumably read elsewhere at
    # runtime -- it is intentionally left untouched.  Confirm before editing.
    #
    # subplot: provides opts, mnp, Title, Labels and Data; its `axes`
    #          attribute is (re)assigned here, and default options are
    #          written into subplot.opts when it is empty.
    # figure:  object whose add_subplot() creates the axes.
    # dims:    dims[0] is the index of the Data column to plot.

    # set default plot options if necessary
    opts = subplot.opts
    if not opts:
        opts['type'] = 'Gaussian KDE'
        opts['bins'] = 200
        opts['transformAuto'] = True
        opts['xTransform'] = ''
        opts['yTransform'] = ''
        opts['kdeDisplay'] = True

    # Set axes transforms
    if opts['transformAuto']:
        opts['xTransform'] = 'log'
        opts['yTransform'] = 'linear'

    subplot.axes = figure.add_subplot(subplot.mnp, title=subplot.Title)
    subplot.axes.set_xlabel(subplot.Labels[dims[0]])

    data = subplot.Data[:, dims[0]]
    if opts['xTransform'] == 'log':
        data = tm.getMethod('log')(data)

    plotType = opts['type']

    # Kernel density estimation
    if plotType in ('Gaussian KDE', 'Both'):
        # Evaluate the KDE on roughly one grid point per ten events.
        # np.linspace requires an integer sample count, and at least two
        # points are needed to draw a line.
        numPoints = max(2, int(data.shape[0] * .1))
        ind = np.linspace(np.min(data), np.max(data), numPoints)
        gkde = stats.gaussian_kde(data)
        kdepdf = gkde.evaluate(ind)
        subplot.axes.plot(ind, kdepdf, label='kde', color='blue')

    # Binned Histogram
    if plotType != 'Gaussian KDE':
        h, b = np.histogram(data, bins=opts['bins'])
        if plotType == 'Both':
            h = tm.getMethod('log')(h)
        # Plot counts against bin centers rather than bin edges.
        b = (b[:-1] + b[1:])/2.0
        subplot.axes.plot(b, h)

    if plotType == 'Both':
        # Make room for whichever curve is taller.
        dataMax = max(np.max(kdepdf), np.max(h))
        subplot.axes.set_ylim(0, dataMax + 0.1)
def bakker_kMeans(data, **kwargs):
    """
    This is an implementation of the k-means algorithm designed specifically
    for flow cytometry data in the following paper:

    T.C.B. Schut, B.G.D. Grooth, and J. Greve,
    "Cluster analysis of flow cytometric list mode data on a personal computer",
    Cytometry,  vol. 14, 1993, pp. 649-659.

    The data is log-transformed, clustered into initClusters groups from
    non-random initial centers (util.kinit), and the resulting clusters are
    then merged down to numClusters by merge().

    @type data: array
    @param data: The data to be clustered
    @type kwargs: dict
    @param kwargs: The following args are accepted:
        - numClusters: The number of clusters to form (default 1)
        - initClusters: The number of initial clusters to create before
          merging (default 200, as suggested by the authors)

    @rtype: tuple
    @return: A list where each element indicates the cluster membership of the
        corresponding index in the original data and a message string
    """
    k = int(kwargs.get('numClusters', 1))
    # Read the option from its own key, not from 'numClusters'.
    initClusters = int(kwargs.get('initClusters', 200))
    msg = ''

    # Log transform
    logData = tm.getMethod('log')(data)

    # Choose large # (200 as suggested by authors) of non-random initial centers
    centers = util.kinit(logData, initClusters)

    # Run k-means
    _, ids = kmeans2(logData, np.array(centers), minit='matrix')

    # Merge clusters w/special comparison metric until user cluster # achieved
    clusters = util.separate(logData, ids)
    finalIDs = merge(k, ids, clusters)

    return finalIDs, msg